# Set the script's path as working directory
parentfolder = rstudioapi::getActiveDocumentContext()$path
setwd(dirname(parentfolder))
parentfolder <- dirname(getwd())
data <- paste0(parentfolder, '/data/')
models <- paste0(parentfolder, '/models/')
plots <- paste0(parentfolder, '/plots/')
scripts <- paste0(parentfolder, '/scripts/')
# Load in packages
library(tidyverse) # data wrangling
library(gridExtra) # plots side by side
#library(iemisc) # for acos_d, which gives inverse cosine in degrees (standard in R is radians) # since 07.2024 not on CRAN
library(aspace) # alternative to iemisc
library(broom) # for tidy model outputs
library(brms)
# option for Bayesian regression models:
# use all available cores for parallel computing
options(mc.cores = parallel::detectCores())
library(HDInterval) # package for credible interval computation
library(tidybayes) # plotting
# library(brmstools) # forest plot but the package is no longer maintained
library(phonR) # for formant plots
# Use a colorblind-friendly palette
colorBlindBlack8 <- c("#000000", "#E69F00", "#56B4E9", "#009E73",
"#F0E442", "#0072B2", "#D55E00", "#CC79A7")
Check versions:
R.Version()$version.string
## [1] "R version 4.4.1 (2024-06-14 ucrt)"
packageVersion("tidyverse")
## [1] '2.0.0'
packageVersion("ggplot2")
## [1] '3.5.1'
packageVersion("brms")
## [1] '2.22.0'
Load data:
can = read_delim(paste0(data,'vowels-only.txt'), delim = '\t', col_names = TRUE, na = "NA")
Remove columns that are repetitive:
can <- can %>%
select(-file, -sound.name, -Fs, -gender)
Participants stand in front of a wall (approx. 1m distance) and are asked to shoot a can appearing on the wall. They should shoot it with a laser pointer by pointing the laser at the can, while uttering the word appearing on the can (piff|paff) at the same time. The can appears on different positions – on the 5x5 grid – and in two different sizes.
Combinations: 25 positions x 2 sizes x 2 words/vowels; N = 100 items per participant.
Overall duration ca. 10 minutes.
Marker coding:
Both hypotheses are grounded in the human body and seek to explain iconic prosody with physiological relationships.
Fundamental frequency is influenced by head position, specifically, that the more upward the head is rotated, the higher the fundamental frequency.
This hypothesis aims to investigate the missing link between the vertical position of an object and f0, and whether an upward-looking head position can drive the cross-modal iconic effect. We expect to find a change in f0 due to head movement since moving the head upwards engages the muscles around the larynx, which are responsible for f0 change.
The degree of jaw opening is influenced by the size of an object being named, with a larger object resulting in a larger degree of jaw opening.
Jaw opening is a physical factor that affects F1, and as such, it is used as the primary measurement in this study to assess the anatomical basis for iconicity. The relationship between F1 and object size has been demonstrated in speech perception, with a higher F1 perceived as coming from a larger source. However, this relationship has yet to be tested from the perspective of speech production.
The columns in the raw data file represent the following variables:
Here begins the actual data analysis.
Calculate distances in 3D:
can <- mutate(can,
dist.mg.f.3d = sqrt((x_f_mean - x_mg_mean)^2 + (y_f_mean - y_mg_mean)^2 + (z_f_mean - z_mg_mean)^2), # mid glasses to front
dist.f.c7.3d = sqrt((x_c7_mean - x_f_mean)^2 + (y_c7_mean - y_f_mean)^2 + (z_c7_mean - z_f_mean)^2), # front to back
dist.c7.mg.3d = sqrt((x_mg_mean - x_c7_mean)^2 + (y_mg_mean - y_c7_mean)^2 + (z_mg_mean - z_c7_mean)^2)) # back to mid glasses
Calculate angles in 3D:
can <- mutate(can,
angle.mg.3d = acos_d((dist.c7.mg.3d^2 + dist.mg.f.3d^2 - dist.f.c7.3d^2) / (2 * dist.c7.mg.3d * dist.mg.f.3d)), # mid glasses
angle.f.3d = acos_d((dist.mg.f.3d^2 + dist.f.c7.3d^2 - dist.c7.mg.3d^2) / (2 * dist.mg.f.3d * dist.f.c7.3d)), # front
angle.c7.3d = acos_d((dist.f.c7.3d^2 + dist.c7.mg.3d^2 - dist.mg.f.3d^2) / (2 * dist.f.c7.3d * dist.c7.mg.3d))) # back -- I use this one as this is the most stable point on the body
Select only necessary columns and rename them:
can <- can %>%
mutate(
f0_range = max_f0 - min_f0,
y_mg_mean = y_mg_mean*100
) %>%
select(
subject, age, hand, monolingual, part, size, v.pos, h.pos, text, start, duration,
overall_dB, f0_range, mean_f0, X50_F1, X50_F2, X50_F3,
height, dataset, y_mg_mean, LipDist, angle.c7.3d
) %>%
rename(
speaker = subject,
can_size = size,
v_pos = v.pos,
h_pos = h.pos,
vowel = text,
dB_mean = overall_dB,
f0_mean = mean_f0,
F1 = X50_F1,
F2 = X50_F2,
F3 = X50_F3,
head_pos = y_mg_mean,
angle = angle.c7.3d,
lip_dist = LipDist)
Look at the data now:
can
We do not remove outliers, as they may be valuables data points. However, we should remove erroneous measurements.
With speaker 16, we forgot to turn on the right microphone, therefore, the data is not trustworthy and we are forced to remove it.
can <- subset(can, speaker != '16')
Pätzold and Simpson (1997, 225) give the following formant values in read speach of female speakers for the vowels in question:
| F1 | F2 | F3 | |
| [ɪ] | 391 (350–442) | 2136 (1905–2348) | 2867 (2660–3026) |
| [a] | 751 (651–838) | 1460 (1346–1583) | 2841 (2679–2983) |
We decided to group the data per participant and per vowel and, within the grouped data, remove outliers individually for f0 range, f0 mean, f1, f2, and f3.
# Columns to process
columns_to_process <- c("dB_mean", "f0_range", "f0_mean", "F1", "F2", "F3")
# Group by Language and Participant and then loop through each column
can_grouped <- can %>%
group_by(speaker, vowel) %>%
mutate(across(all_of(columns_to_process), .fns = ~{
cat("Processing column:", deparse(substitute(.)), "\n")
# Calculate IQR
Q1 <- quantile(.x, 0.25, na.rm = TRUE)
Q3 <- quantile(.x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Define lower and upper bounds for outliers
lower_bound <- Q1 - 2 * IQR
upper_bound <- Q3 + 2 * IQR
# Identify outliers
outliers <- .x < lower_bound | .x > upper_bound
# Replace outliers with NA
.x[outliers] <- NA
.x
})) %>%
ungroup()
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: dB_mean
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_range
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: f0_mean
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F1
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F2
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
## Processing column: F3
Check how the outlier removal impacted the data. First, the amount of NAs.
# Combine the results into a data frame
na_summary_phon <- data.frame(
NA_Count_Pre_Removal = colSums(is.na(can[, columns_to_process])),
NA_Count_Post_Removal = colSums(is.na(can_grouped[, columns_to_process])),
NA_Perc_Pre_Removal = (colSums(is.na(can[, columns_to_process])) / nrow(can)) * 100,
NA_Perc_Post_Removal = (colSums(is.na(can_grouped[, columns_to_process])) / nrow(can_grouped)) * 100
)
na_summary_phon
Then, look at the means. When looking at the table, it’s best to sort by speaker to see where there was a change.
# Calculate means for each variable in the pre-removal dataset
means_can <- can %>%
group_by(speaker, vowel) %>%
summarise(across(all_of(columns_to_process), mean, na.rm = TRUE)) %>%
ungroup()
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(all_of(columns_to_process), mean, na.rm = TRUE)`.
## ℹ In group 1: `speaker = 1` and `vowel = "I"`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
# Calculate means for each variable in the post-removal dataset
means_can_grouped <- can_grouped %>%
group_by(speaker, vowel) %>%
summarise(across(all_of(columns_to_process), mean, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
# Combine the means into a single summary table
outlier_summary_acc <- bind_rows(
tibble(Variable = "Pre-Removal", means_can),
tibble(Variable = "Post-Removal", means_can_grouped)
)
After inspecting the outlier removal, change the name back to can and remove the unnecessary data frames.
can <- can_grouped
rm(can_grouped, means_can, means_can_grouped)
rm(columns_to_process)
We will do the same, but for the kinematic variables. Here, we will only group by speaker.
# Columns to process
columns_to_process <- c("head_pos", "angle", "lip_dist")
# Group by Language and Participant and then loop through each column
can_grouped <- can %>%
group_by(speaker) %>%
mutate(across(all_of(columns_to_process), .fns = ~{
cat("Processing column:", deparse(substitute(.)), "\n")
# Calculate IQR
Q1 <- quantile(.x, 0.25, na.rm = TRUE)
Q3 <- quantile(.x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
# Define lower and upper bounds for outliers
lower_bound <- Q1 - 2 * IQR
upper_bound <- Q3 + 2 * IQR
# Identify outliers
outliers <- .x < lower_bound | .x > upper_bound
# Replace outliers with NA
.x[outliers] <- NA
.x
})) %>%
ungroup()
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: head_pos
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: angle
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
## Processing column: lip_dist
Check how the outlier removal impacted the data. First, the amount of NAs.
# Combine the results into a data frame
na_summary_kin <- data.frame(
NA_Count_Pre_Removal = colSums(is.na(can[, columns_to_process])),
NA_Count_Post_Removal = colSums(is.na(can_grouped[, columns_to_process])),
NA_Perc_Pre_Removal = (colSums(is.na(can[, columns_to_process])) / nrow(can)) * 100,
NA_Perc_Post_Removal = (colSums(is.na(can_grouped[, columns_to_process])) / nrow(can_grouped)) * 100
)
na_summary_kin
Then, look at the means. When looking at the table, it’s best to sort by speaker to see where there was a change.
# Calculate means for each variable in the pre-removal dataset
means_can <- can %>%
group_by(speaker) %>%
summarise(across(all_of(columns_to_process), mean, na.rm = TRUE)) %>%
ungroup()
# Calculate means for each variable in the post-removal dataset
means_can_grouped <- can_grouped %>%
group_by(speaker) %>%
summarise(across(all_of(columns_to_process), mean, na.rm = TRUE)) %>%
ungroup()
# Combine the means into a single summary table
outlier_summary_kin <- bind_rows(
tibble(Variable = "Pre-Removal", means_can),
tibble(Variable = "Post-Removal", means_can_grouped)
)
After inspecting the outlier removal, change the name back to can and remove the unnecessary data frames.
can <- can_grouped
rm(can_grouped, means_can, means_can_grouped)
rm(columns_to_process)
# Create tibble with means and sd per speaker and vowel:
can %>%
na.omit() %>%
group_by(speaker, vowel) %>%
summarize_at(vars(dB_mean, f0_mean, f0_range, F1, F2, F3), list(mean = mean, sd = sd)) %>%
ungroup() %>%
{. ->> speakers_phon}
# Join tibble with means per speaker and vowel
can <- can %>%
left_join(speakers_phon, by = c("speaker", "vowel")) %>%
# Calculate z-scores
mutate(
db_mean_z = (dB_mean - dB_mean_mean) / dB_mean_sd,
f0_mean_z = (f0_mean - f0_mean_mean) / f0_mean_sd,
f0_range_z = (f0_range - f0_range_mean) / f0_range_sd,
F1_z = (F1 - F1_mean) / F1_sd,
F2_z = (F2 - F2_mean) / F2_sd,
F3_z = (F3 - F3_mean) / F3_sd
) %>%
# Deselect unneeded variables
select(-c(
dB_mean_sd, dB_mean_mean,
f0_mean_sd, f0_mean_mean,
f0_range_sd, f0_range_mean,
F1_sd, F1_mean,
F2_sd, F2_mean,
F3_sd, F3_mean
))
# Create tibble with means and sd per speaker and vowel for head_pos, lip_dist, and angle:
can %>%
na.omit() %>%
group_by(speaker, vowel) %>%
summarize_at(vars(head_pos, lip_dist, angle), list(mean = mean, sd = sd)) %>%
ungroup() %>%
{. ->> speakers_kin}
# Join tibble with means per speaker and vowel
can <- can %>%
left_join(speakers_kin, by = c("speaker", "vowel")) %>%
# Calculate z-scores
mutate(
head_pos_z = (head_pos - head_pos_mean) / head_pos_sd,
lip_dist_z = (lip_dist - lip_dist_mean) / lip_dist_sd,
angle_z = (angle - angle_mean) / angle_sd
) %>%
# deselect unneeded variables
select(-c(
head_pos_sd, head_pos_mean,
lip_dist_sd, lip_dist_mean,
angle_sd, angle_mean
))
People differ in the SD of the head position and thus head angle. This suggests there are some who move their head more, and some who move their head less.
The idea is to group speakers into movers and non-movers by looking at the SD of the mean head position:
can %>%
group_by(speaker) %>%
summarize_at(vars(head_pos), list(mean = ~mean(., na.rm = TRUE), sd = ~sd(., na.rm = TRUE))) %>%
{. ->> speakers_pos} %>%
ungroup() %>%
print(n = Inf)
## # A tibble: 30 Ă— 3
## speaker mean sd
## <dbl> <dbl> <dbl>
## 1 1 152. 2.00
## 2 2 160. 0.615
## 3 3 164. 0.747
## 4 4 162. 1.56
## 5 5 159. 0.557
## 6 6 155. 1.15
## 7 7 164. 1.66
## 8 8 156. 0.811
## 9 9 151. 0.879
## 10 10 150. 0.779
## 11 11 164. 0.969
## 12 12 154. 0.447
## 13 13 164. 0.403
## 14 14 163. 0.934
## 15 15 172. 1.73
## 16 17 157. 0.832
## 17 18 155. 0.791
## 18 19 156. 0.744
## 19 20 147. 1.56
## 20 21 159. 1.85
## 21 22 154. 1.64
## 22 23 151. 0.627
## 23 24 175. 1.21
## 24 25 170. 1.34
## 25 26 162. 2.58
## 26 27 169. 0.540
## 27 28 165. 1.06
## 28 29 176. 0.978
## 29 30 168. 1.23
## 30 31 160. 1.49
Check mean of SD to estimate point of division:
mean(speakers_pos$sd)
## [1] 1.123773
Group of movers with SD above 1.12:
filter(speakers_pos, sd > 1.123773)
N = 13
Group of non-movers with SD below 1.12:
filter(speakers_pos, sd < 1.123773)
N = 17
Add a categorical variable mover/non-mover:
can <- left_join(can,
can %>%
group_by(speaker) %>%
summarize_at(vars(head_pos),
list(mean_pos = ~ mean(., na.rm = TRUE),
sd_pos = ~ sd(., na.rm = TRUE)),
na.rm = TRUE) %>%
mutate(mover = ifelse(sd_pos < 1.123773, 'no', 'yes')) %>%
ungroup())
Calculate the difference between the actual angle and the speaker’s mean to plot it later:
can <- can %>%
mutate(pos_difference = head_pos - mean_pos) %>%
select(-c(sd_pos, mean_pos)) # deselect unneded variables
NAs are problematic for phonR, therefore I’m replacing them with a placeholder value
can_noNA <- can %>%
mutate(across(c(F1, F2, F3), ~ifelse(is.na(.), 9999, .)))
Add bark (= another type of normalized scale) values for formants
can_noNA <- mutate(can_noNA,
F1_bark = normBark(F1),
F2_bark = normBark(F2),
F3_bark = normBark(F3))
Replace back the placeholder value with NAs for formants in herz and also replace the respective cells of formants transofrmed to bark to NA.
can_noNA <- mutate(can_noNA,
across(c(F1, F2, F3), ~ifelse(. == 9999, NA, .)),
across(c(F1_bark, F2_bark, F3_bark),
~ifelse(is.na(get(substring(cur_column(), 1, 2))), NA, .)))
can <- can_noNA
rm(can_noNA)
We have to create a data frame where can sizes are aside each other for each condition to be able to calculate the distances.
# Pivot the data to create separate columns for each variable and category
can_formants_aside <- can %>%
pivot_wider(
id_cols = c(speaker, vowel, v_pos, h_pos),
names_from = can_size,
values_from = c(F1, F2, F3, lip_dist, F1_z, F2_z, F3_z, F1_bark, F2_bark, F3_bark),
names_sep = "_"
) %>%
# Rename columns for clarity
rename(
F1_large = F1_large,
F1_small = F1_small,
F2_large = F2_large,
F2_small = F2_small,
F3_large = F3_large,
F3_small = F3_small,
lip_dist_large = lip_dist_large,
lip_dist_small = lip_dist_small,
F1_z_large = F1_z_large,
F1_z_small = F1_z_small,
F2_z_large = F2_z_large,
F2_z_small = F2_z_small,
F3_z_large = F3_z_large,
F3_z_small = F3_z_small,
F1_bark_large = F1_bark_large,
F1_bark_small = F1_bark_small,
F2_bark_large = F2_bark_large,
F2_bark_small = F2_bark_small,
F3_bark_large = F3_bark_large,
F3_bark_small = F3_bark_small
)
# View the resulting dataframe
print(can_formants_aside)
## # A tibble: 1,500 Ă— 24
## speaker vowel v_pos h_pos F1_small F1_large F2_small F2_large F3_small
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 I 2 3 503. 589. 1871. 1804. 2789.
## 2 1 I 4 4 538. 564. 1845. 1555. 2623.
## 3 1 I 1 1 NA 616. 1876. 1686. 2765.
## 4 1 I 3 5 527. 552. 1680. 1862. 2610.
## 5 1 a 2 5 756. 769. 1520. 1796. 2770.
## 6 1 I 3 4 564. 616. 1971. 1881. 2834.
## 7 1 a 5 3 781. 858. 1899. 1553. 2496.
## 8 1 I 1 5 552. 562. 1580. 1923. 2543.
## 9 1 a 5 4 728. 810. 1513. NA 2609.
## 10 1 a 3 3 785. 760. 1522. 1544. 2487.
## # ℹ 1,490 more rows
## # ℹ 15 more variables: F3_large <dbl>, lip_dist_small <dbl>,
## # lip_dist_large <dbl>, F1_z_small <dbl>, F1_z_large <dbl>, F2_z_small <dbl>,
## # F2_z_large <dbl>, F3_z_small <dbl>, F3_z_large <dbl>, F1_bark_small <dbl>,
## # F1_bark_large <dbl>, F2_bark_small <dbl>, F2_bark_large <dbl>,
## # F3_bark_small <dbl>, F3_bark_large <dbl>
Calculate Euclidean distance between large and small can formants in the same condition:
can_formants_aside <- mutate(can_formants_aside,
dist_size_hz = sqrt((F1_large - F1_small)^2 + (F2_large - F2_small)^2),
dist_size_z = sqrt((F1_z_large - F1_z_small)^2 + (F2_z_large - F2_z_small)^2),
dist_size_bark = sqrt((F1_bark_large - F1_bark_small)^2 + (F2_bark_large - F2_bark_small)^2)
)
Contrast code vowel, can_size, and mover (the latter just in case) and store as separate vectors:
can <- can %>%
mutate(
vowel_s = if_else(vowel == "I", -0.5, 0.5),
can_size_s = if_else(can_size == "small", -0.5, 0.5),
mover_s = if_else(mover == "no", -0.5, 0.5)
)
How many speakers?
length(unique(can$speaker))
## [1] 30
How many unique realizations per speaker?
can %>%
group_by(speaker) %>%
dplyr::summarize(num_unique_realizations = n_distinct(paste(can_size, v_pos, h_pos, vowel))) %>%
ungroup() %>%
print(n = 30)
## # A tibble: 30 Ă— 2
## speaker num_unique_realizations
## <dbl> <int>
## 1 1 100
## 2 2 100
## 3 3 100
## 4 4 100
## 5 5 100
## 6 6 100
## 7 7 100
## 8 8 100
## 9 9 100
## 10 10 100
## 11 11 100
## 12 12 100
## 13 13 100
## 14 14 100
## 15 15 100
## 16 17 100
## 17 18 100
## 18 19 100
## 19 20 100
## 20 21 100
## 21 22 100
## 22 23 100
## 23 24 100
## 24 25 100
## 25 26 100
## 26 27 100
## 27 28 100
## 28 29 100
## 29 30 100
## 30 31 100
NAs in variables per speaker.
can %>%
group_by(speaker) %>%
mutate(
across(c(dB_mean, f0_mean, f0_range, F1, F2, F3, head_pos, lip_dist, angle),
~sum(is.na(.))),
.groups = 'drop'
) %>%
print(n = 30)
## # A tibble: 3,000 Ă— 40
## # Groups: speaker [30]
## speaker age hand monolingual part can_size v_pos h_pos vowel start
## <dbl> <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1 28 r yes 1 small 2 3 I 3.29
## 2 1 28 r yes 1 small 4 4 I 6.66
## 3 1 28 r yes 1 small 1 1 I 9.90
## 4 1 28 r yes 1 small 3 5 I 13.2
## 5 1 28 r yes 1 large 2 5 a 16.2
## 6 1 28 r yes 1 small 3 4 I 19.3
## 7 1 28 r yes 1 small 5 3 a 22.4
## 8 1 28 r yes 1 large 3 5 I 25.5
## 9 1 28 r yes 1 large 1 5 I 28.6
## 10 1 28 r yes 1 large 5 4 a 31.2
## 11 1 28 r yes 1 small 3 3 a 33.8
## 12 1 28 r yes 1 large 3 4 I 36.4
## 13 1 28 r yes 1 large 1 3 I 39.1
## 14 1 28 r yes 1 large 5 3 I 42.0
## 15 1 28 r yes 1 small 4 1 I 44.6
## 16 1 28 r yes 1 small 1 2 I 47.4
## 17 1 28 r yes 1 large 2 1 a 51.5
## 18 1 28 r yes 1 large 3 4 a 54.8
## 19 1 28 r yes 1 small 5 2 a 57.8
## 20 1 28 r yes 1 large 4 5 I 61.2
## 21 1 28 r yes 1 small 5 1 a 64.1
## 22 1 28 r yes 1 large 2 1 I 67.1
## 23 1 28 r yes 1 large 4 1 a 70.1
## 24 1 28 r yes 1 large 1 4 I 73.2
## 25 1 28 r yes 1 small 4 4 a 76.3
## 26 1 28 r yes 1 large 5 5 a 79.2
## 27 1 28 r yes 1 small 2 2 I 82.3
## 28 1 28 r yes 1 large 4 1 I 85.5
## 29 1 28 r yes 1 large 4 3 I 88.4
## 30 1 28 r yes 1 large 5 1 I 91.4
## # ℹ 2,970 more rows
## # ℹ 30 more variables: duration <dbl>, dB_mean <int>, f0_range <int>,
## # f0_mean <int>, F1 <int>, F2 <int>, F3 <int>, height <dbl>, dataset <dbl>,
## # head_pos <int>, lip_dist <int>, angle <int>, db_mean_z <dbl>,
## # f0_mean_z <dbl>, f0_range_z <dbl>, F1_z <dbl>, F2_z <dbl>, F3_z <dbl>,
## # head_pos_z <dbl>, lip_dist_z <dbl>, angle_z <dbl>, mover <chr>,
## # pos_difference <dbl>, F1_bark <dbl>, F2_bark <dbl>, F3_bark <dbl>, …
Because each participant had a total of 100 trials, these values are also percentages. There is quite some imbalance in the missing values. It’s worth to keep that in mind and definitely allow for individual variance by speaker in the model.
Check means of phonetic variables per speaker:
print(speakers_phon, n = 60)
## # A tibble: 60 Ă— 14
## speaker vowel dB_mean_mean f0_mean_mean f0_range_mean F1_mean F2_mean F3_mean
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 I 89.1 274. 44.0 574. 1752. 2808.
## 2 1 a 88.6 251. 32.9 772. 1611. 2696.
## 3 2 I 80.1 299. 69.3 541. 1895. 2769.
## 4 2 a 78.0 272. 69.2 791. 1449. 2941.
## 5 3 I 76.2 239. 27.4 440. 1973. 3241.
## 6 3 a 71.5 201. 32.3 722. 1660. 2933.
## 7 4 I 74.7 177. 64.8 549. 1884. 2787.
## 8 4 a 73.3 177. 22.4 930. 1365. 2723.
## 9 5 I 76.8 241. 21.4 442. 1850. 2687.
## 10 5 a 76.5 234. 14.3 700. 1230. 2830.
## 11 6 I 78.8 215. 47.8 473. 1850. 2620.
## 12 6 a 76.3 201. 23.1 830. 1372. 2677.
## 13 7 I 87.0 300. 136. 500. 1895. 2633.
## 14 7 a 85.7 333. 55.1 846. 1515. 2719.
## 15 8 I 76.9 234. 35.0 499. 1643. 2759.
## 16 8 a 77.5 226. 24.8 774. 1425. 2599.
## 17 9 I 80.3 265. 36.5 527. 1976. 2907.
## 18 9 a 78.1 250. 36.8 928. 1597. 2858.
## 19 10 I 79.7 244. 30.0 484. 2028. 2948.
## 20 10 a 74.4 236. 19.8 868. 1328. 2892.
## 21 11 I 80.1 213. 25.9 430. 1752. 2560.
## 22 11 a 76.8 192. 29.4 745. 1263. 2615.
## 23 12 I 81.4 204. 56.3 503. 1690. 2719.
## 24 12 a 81.8 196. 41.6 843. 1417. 2846.
## 25 13 I 84.8 248. 35.5 490. 1727. 2624.
## 26 13 a 80.5 241. 35.5 772. 1295. 2133.
## 27 14 I 75.7 270. 18.5 449. 1724. 2602.
## 28 14 a 74.4 253. 17.1 717. 1242. 2703.
## 29 15 I 76.1 262. 35.4 487. 1683. 2592.
## 30 15 a 77.5 223. 22.7 807. 1461. 2717.
## 31 17 I 82.3 236. 17.2 470. 1739. 2774.
## 32 17 a 80.5 231. 10.7 680. 1298. 2762.
## 33 18 I 80.6 209. 30.5 456. 2148. 3133.
## 34 18 a 79.7 186. 26.2 871. 1593. 2750.
## 35 19 I 90.5 257. 34.6 540. 1653. 2761.
## 36 19 a 88.3 239. 38.6 967. 1562. 3057.
## 37 20 I 86.8 251. 42.1 524. 1742. 2716.
## 38 20 a 83.5 228. 40.7 884. 1397. 2789.
## 39 21 I 84.9 264. 28.7 542. 1944. 2875.
## 40 21 a 82.6 245. 22.8 894. 1535. 2869.
## 41 22 I 88.9 261. 26.2 531. 1761. 2718.
## 42 22 a 85.2 231. 21.5 847. 1442. 2727.
## 43 23 I 78.9 207. 21.3 416. 1924. 2779.
## 44 23 a 73.8 158. 28.5 825. 1284. 2632.
## 45 24 I 78.0 224. 24.0 469. 1745. 2617.
## 46 24 a 71.7 183. 31.6 698. 1260. 2645.
## 47 25 I 73.1 230. 43.7 514. 1620. 2508.
## 48 25 a 71.6 199. 36.0 797. 1263. 2707.
## 49 26 I 85.2 201. 37.0 546. 2133. 2809.
## 50 26 a 84.5 187. 15.3 893. 1467. 2762.
## 51 27 I 82.6 229. 16.4 467. 1661. 2667.
## 52 27 a 79.1 210. 18.9 716. 1275. 2857.
## 53 28 I 79.4 249. 30.7 502. 1804. 2654.
## 54 28 a 78.2 240. 25.0 810. 1403. 2666.
## 55 29 I 82.6 235. 17.4 476. 1817. 2517.
## 56 29 a 80.9 211. 14.6 669. 1250. 2639.
## 57 30 I 83.5 275. 19.2 535. 2092. 2832.
## 58 30 a 84.2 244. 9.17 922. 1547. 2847.
## 59 31 I 85.2 254. 27.3 510. 1690. 2659.
## 60 31 a 83.2 244. 47.1 815. 1316. 2780.
## # ℹ 6 more variables: dB_mean_sd <dbl>, f0_mean_sd <dbl>, f0_range_sd <dbl>,
## # F1_sd <dbl>, F2_sd <dbl>, F3_sd <dbl>
Check means of kinematic variables per speaker:
print(speakers_kin, n = 60)
## # A tibble: 60 Ă— 8
## speaker vowel head_pos_mean lip_dist_mean angle_mean head_pos_sd lip_dist_sd
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 I 152. 4.71 46.9 1.99 0.0777
## 2 1 a 152. 5.08 46.9 1.98 0.0929
## 3 2 I 160. 4.07 65.5 0.580 0.0788
## 4 2 a 160. 4.42 65.4 0.623 0.138
## 5 3 I 164. 3.45 55.3 0.766 0.108
## 6 3 a 163. 3.87 55.0 0.732 0.119
## 7 4 I 162. 4.27 58.4 1.60 0.106
## 8 4 a 162. 4.44 59.1 1.55 0.120
## 9 5 I 159. 4.18 64.4 0.531 0.121
## 10 5 a 159. 4.46 64.5 0.541 0.124
## 11 6 I 156. 3.89 46.5 1.17 0.131
## 12 6 a 155. 4.44 46.0 1.08 0.152
## 13 7 I 164. 3.90 48.5 1.65 0.209
## 14 7 a 164. 4.27 49.0 1.65 0.209
## 15 8 I 156. 4.03 52.8 0.880 0.132
## 16 8 a 156. 4.60 52.7 0.760 0.136
## 17 9 I 152. 4.26 49.0 0.822 0.102
## 18 9 a 151. 4.72 48.7 0.937 0.130
## 19 10 I 150. 3.71 58.3 0.761 0.120
## 20 10 a 150. 4.05 58.6 0.705 0.179
## 21 11 I 164. 4.14 61.6 0.959 0.0978
## 22 11 a 163. 4.66 61.7 0.990 0.0913
## 23 12 I 154. 3.76 59.1 0.479 0.0685
## 24 12 a 154. 4.05 59.0 0.417 0.111
## 25 13 I 164. 4.54 55.3 0.399 0.0599
## 26 13 a 164. 5.04 55.6 0.394 0.112
## 27 14 I 163. 3.93 55.1 0.914 0.0819
## 28 14 a 163. 4.34 55.0 0.967 0.164
## 29 15 I 172. 3.99 55.5 1.81 0.0936
## 30 15 a 171. 4.31 55.3 1.65 0.172
## 31 17 I 157. 4.16 43.3 0.810 0.0803
## 32 17 a 157. 4.56 43.1 0.878 0.105
## 33 18 I 155. 3.81 52.1 0.789 0.0563
## 34 18 a 155. 4.11 52.0 0.825 0.0626
## 35 19 I 156. 4.38 60.0 0.740 0.0638
## 36 19 a 156. 4.85 59.7 0.705 0.0997
## 37 20 I 147. 3.83 57.5 1.45 0.0705
## 38 20 a 147. 4.51 57.1 1.64 0.138
## 39 21 I 159. 3.91 56.9 1.95 0.128
## 40 21 a 159. 4.45 57.0 1.66 0.151
## 41 22 I 154. 4.24 47.8 1.65 0.0998
## 42 22 a 154. 4.86 47.3 1.65 0.153
## 43 23 I 151. 4.21 61.6 0.637 0.0800
## 44 23 a 151. 4.59 61.9 0.597 0.165
## 45 24 I 175. 3.74 53.9 1.12 0.0822
## 46 24 a 175. 4.39 53.9 1.25 0.106
## 47 25 I 170. 4.24 48.7 1.24 0.0846
## 48 25 a 169. 4.64 49.1 1.51 0.165
## 49 26 I 162. 3.94 63.0 2.50 0.0871
## 50 26 a 162. 4.26 63.0 2.77 0.103
## 51 27 I 169. 4.39 57.6 0.587 0.106
## 52 27 a 169. 4.99 57.2 0.500 0.174
## 53 28 I 165. 4.64 54.9 1.12 0.128
## 54 28 a 165. 4.99 55.4 1.04 0.162
## 55 29 I 176. 3.62 54.0 0.925 0.0971
## 56 29 a 176. 3.89 53.8 0.987 0.105
## 57 30 I 168. 3.80 60.7 1.24 0.120
## 58 30 a 168. 4.33 60.4 1.18 0.154
## 59 31 I 160. 3.59 49.3 1.49 0.0873
## 60 31 a 160. 3.92 49.6 1.42 0.162
## # ℹ 1 more variable: angle_sd <dbl>
Check age:
summary(can$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 23.00 26.50 27.77 32.00 48.00
Check height:
summary(can$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 152.0 162.0 168.0 167.3 172.0 183.0
Check monolingual:
summary(as.factor(can$monolingual))
## no yes
## 500 2500
# meaning 5 bilingual, 25 monolingual
Check handedness:
summary(as.factor(can$hand))
## l r
## 100 2900
# meaning 1 left, 29 right
Let’s export the csv.
write.csv(can, paste0(data, "can.csv"), row.names = FALSE)
Here, I will start separating the two analyses for clarity purposes. We begin with the first hypothesis that proposes that fundamental frequency is influenced by head position, specifically, that the more upward the head is rotated, the higher the fundamental frequency.
While head position is unidimensional, angle covers movement across three dimensions. We have to choose which of them we will take for the analysis.
First, let’s check their correlation in classical terms.
cor.test(can$angle, can$head_pos)
##
## Pearson's product-moment correlation
##
## data: can$angle and can$head_pos
## t = 7.2609, df = 2990, p-value = 4.88e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.09625132 0.16667973
## sample estimates:
## cor
## 0.1316316
# what will happen if we use z-normalized values
cor.test(can$angle_z, can$head_pos_z)
##
## Pearson's product-moment correlation
##
## data: can$angle_z and can$head_pos_z
## t = 113.96, df = 2990, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8946560 0.9080823
## sample estimates:
## cor
## 0.901586
# the correlation skyrocketed! but it was there anyway
Plotting the raw values…
ggplot(can,
aes(x = head_pos,
y = angle)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = F)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
and now the z-scored.
ggplot(can,
aes(x = head_pos_z,
y = angle_z)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = F)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).
So we cannot use both, but pick one. I am logically tending to pick angle because of larger dimensionality. However, higher dimensionality introduces degrees of freedom.
# Plot for head_pos vs f0_mean_z
f0_head_pos <- ggplot(can,
aes(x = f0_mean_z,
y = head_pos_z)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = FALSE) +
labs(title = "Head position vs f0 mean")
# Plot for angle vs f0_mean_z
f0_angle <- ggplot(can,
aes(x = f0_mean_z,
y = angle_z)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = FALSE) +
labs(title = "Angle vs f0 mean")
# Arrange plots side by side
grid.arrange(f0_head_pos, f0_angle, ncol = 2)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 136 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 136 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 133 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 133 rows containing missing values or values outside the scale range
## (`geom_point()`).
Visually, they are basically the same. I think it is better to go with head position then, because it captures exactly the movement of interest.
Logically, the vertical position would induce movement. It does not do it always, but if we put two highly correlated factors in the analysis, this may confuse the model and the effects may be diluted.
Therefore, we check for the correlation between head position and vertical position.
cor.test(can$head_pos, can$v_pos)
##
## Pearson's product-moment correlation
##
## data: can$head_pos and can$v_pos
## t = -7.0104, df = 2992, p-value = 2.925e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.16220674 -0.09171838
## sample estimates:
## cor
## -0.1271231
# what will happen if we use z-normalized values
cor.test(can$head_pos_z, can$v_pos)
##
## Pearson's product-moment correlation
##
## data: can$head_pos_z and can$v_pos
## t = -82.996, df = 2992, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8455047 -0.8237896
## sample estimates:
## cor
## -0.8349719
Plotting the raw values…
ggplot(can,
aes(x = v_pos,
y = head_pos)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = F)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
Given the correlation, we should not put them in the same model. We will have to separate them and see how they perform in the same environment.
Here, I’m computing a model that estimates the probability of an effect of head position (among others) on f0. The first iteration of this analysis was done with the help of Timo Roettger. Since then, however, the analysis has been upgraded with the comments of three anonymous reviewers and Bodo Winter.
We can expect f0 not to be normally distributed. However, if we use some kind of normalization for the outcome variable, like z-scoring, we cannot fit group-level effects by participant (thanks for this insight to Stefano Coretta!). I will use a lognormal family for the model. Therefore, we must think of the priors on the log scale.
We will fit an intercept-only model for comparison, a model without random slopes and intercepts, and a maximal model. First, we inspect the priors we must specify for each model. Then, we will specify the priors separately.
For each model, I will fit weakly informative priors.
get_prior(formula = f0_mean ~ 1,
data = can,
family = lognormal())
## Warning: Rows containing NAs were excluded from the model.
Let’s find a weakly informative prior
# If we assume that the mean for f0 is 200
log(200)
## [1] 5.298317
# If we set the prior to normal(0,1), what would be the boundaries
exp(log(200)-0.5); exp(log(200)+0.5)
## [1] 121.3061
## [1] 329.7443
# If we set the prior to normal(0,2), what would be the boundaries
exp(log(200)-1); exp(log(200)+1)
## [1] 73.57589
## [1] 543.6564
# If we set the prior to normal(0,3), what would be the boundaries
exp(log(200)-1.5); exp(log(200)+1.5)
## [1] 44.62603
## [1] 896.3378
# I think that for (0,3) we can safely assume to respect all datapoints. This will be the prior we choose for the intercept.
Define priors for the intercept-only model
priors_intOnlyH1 <- c(
prior('normal(0, 3)', class = 'Intercept')
)
get_prior(formula = f0_mean ~ 1 + vowel_s + can_size_s + head_pos + h_pos + v_pos + height + # angle and v_pos will be separated
(1 | speaker),
data = can,
family = lognormal())
## Warning: Rows containing NAs were excluded from the model.
We don’t have specific predictions about the magnitude of the specific effects. Most importantly we do not want to constrain any possible effect.
priors_noRandomH1 <- c(
prior('normal(0, 3)', class = 'Intercept'),
prior('normal(0, 1)', class = b)
)
get_prior(formula = f0_mean ~ 1 + vowel_s + can_size_s + head_pos + h_pos + v_pos + height + # head_pos and v_pos will be separated
(1 + vowel_s + can_size_s + head_pos + h_pos + v_pos || speaker),
data = can,
family = lognormal())
## Warning: Rows containing NAs were excluded from the model.
I will not specify any priors for the group-level effects.
priors_maxH1 <- c(
prior('normal(0, 3)', class = 'Intercept'),
prior('normal(0, 1)', class = b)
)
mdl_intOnlyH1 <- brm(f0_mean ~ 1,
data = can,
prior = priors_intOnlyH1,
family = lognormal(),
#backend = "cmdstanr", # depends on you
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_intOnlyH1.rds"))
# if we need to compress the model more
#saveRDS(mdl_intOnlyH1, file = paste0(models, "mdl_intOnlyH1.rds"), compress = "xz")
mdl_intOnlyH1 <- readRDS(paste0(models, "mdl_intOnlyH1.rds"))
Let’s check how it looks like.
summary(mdl_intOnlyH1)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1
## Data: can (Number of observations: 2870)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.44 0.00 5.44 5.45 1.00 10001 8431
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.15 0.00 0.15 0.16 1.00 7430 7709
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_intOnlyH1)
pp_check(mdl_intOnlyH1, ndraws = 100)
The pp_check shows a slight right-skew and a “bump” around 200 Hz in the raw data which the model does not really respect.
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_intOnlyH1_loo.rds"))) {
mdl_intOnlyH1_loo <- readRDS(paste0(models, "mdl_intOnlyH1_loo.rds"))
} else {
mdl_intOnlyH1_loo <- loo(mdl_intOnlyH1)
saveRDS(mdl_intOnlyH1_loo, paste0(models, "mdl_intOnlyH1_loo.rds"))
}
Check the loo.
mdl_intOnlyH1_loo
##
## Computed from 16000 by 2870 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -14319.8 47.5
## p_loo 2.5 0.2
## looic 28639.7 95.1
## ------
## MCSE of elpd_loo is 0.0.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
mdl_noRandomH1_headPos <- brm(f0_mean ~ 1 + vowel_s + can_size_s +
head_pos + h_pos + height +
(1 | speaker),
data = can,
prior = priors_noRandomH1,
family = lognormal(),
backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 997,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_noRandomH1_headPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_noRandomH1_headPos, file = paste0(models, "mdl_noRandomH1_headPos.rds"), compress = "xz")
mdl_noRandomH1_headPos <- readRDS(paste0(models, "mdl_noRandomH1_headPos.rds"))
Let’s check how it looks like.
summary(mdl_noRandomH1_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 | speaker)
## Data: can (Number of observations: 2864)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.02 0.11 0.19 1.00 1852 3620
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.59 0.59 4.41 6.74 1.00 2042 2963
## vowel_s -0.08 0.00 -0.09 -0.08 1.00 8656 8489
## can_size_s -0.00 0.00 -0.01 0.00 1.00 8083 7971
## head_pos 0.01 0.00 0.00 0.01 1.00 8803 8446
## h_pos 0.00 0.00 -0.00 0.00 1.00 11993 8761
## height -0.01 0.00 -0.01 0.00 1.00 2245 3464
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.08 0.00 0.07 0.08 1.00 7622 7693
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_noRandomH1_headPos)
conditional_effects(mdl_noRandomH1_headPos, sample_prior = "only")
pp_check(mdl_noRandomH1_headPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_noRandomH1_headPos_loo.rds"))) {
mdl_noRandomH1_headPos_loo <- readRDS(paste0(models, "mdl_noRandomH1_headPos_loo.rds"))
} else {
mdl_noRandomH1_headPos_loo <- loo(mdl_noRandomH1_headPos)
saveRDS(mdl_noRandomH1_headPos_loo, paste0(models, "mdl_noRandomH1_headPos_loo.rds"))
}
Check the loo.
mdl_noRandomH1_headPos_loo
##
## Computed from 16000 by 2864 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -12313.1 81.7
## p_loo 40.7 3.6
## looic 24626.2 163.4
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
mdl_noRandomH1_vPos <- brm(f0_mean ~ 1 + vowel_s + can_size_s +
h_pos + v_pos + height +
(1 | speaker),
data = can,
prior = priors_noRandomH1,
family = lognormal(),
#backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_noRandomH1_vPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_noRandomH1_vPos, file = paste0(models, "mdl_noRandomH1_vPos.rds"), compress = "xz")
mdl_noRandomH1_vPos <- readRDS(paste0(models, "mdl_noRandomH1_vPos.rds"))
Let’s check how it looks like.
summary(mdl_noRandomH1_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 | speaker)
## Data: can (Number of observations: 2870)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.02 0.11 0.19 1.00 1998 2925
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.62 0.61 4.39 6.83 1.00 2204 3069
## vowel_s -0.08 0.00 -0.09 -0.08 1.00 9215 8037
## can_size_s -0.01 0.00 -0.01 0.00 1.00 9097 8006
## h_pos 0.00 0.00 -0.00 0.00 1.00 11931 9679
## v_pos -0.01 0.00 -0.01 -0.00 1.00 11686 10155
## height -0.00 0.00 -0.01 0.01 1.00 2197 3059
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.08 0.00 0.07 0.08 1.00 7586 7732
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_noRandomH1_vPos)
conditional_effects(mdl_noRandomH1_vPos, sample_prior = "only")
pp_check(mdl_noRandomH1_vPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_noRandomH1_vPos_loo.rds"))) {
mdl_noRandomH1_vPos_loo <- readRDS(paste0(models, "mdl_noRandomH1_vPos_loo.rds"))
} else {
mdl_noRandomH1_vPos_loo <- loo(mdl_noRandomH1_vPos)
saveRDS(mdl_noRandomH1_vPos_loo, paste0(models, "mdl_noRandomH1_vPos_loo.rds"))
}
Check the loo.
mdl_noRandomH1_vPos_loo
##
## Computed from 16000 by 2870 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -12337.9 81.8
## p_loo 40.0 3.5
## looic 24675.8 163.7
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
mdl_maxH1_headPos <- brm(f0_mean ~ 1 + vowel_s + can_size_s +
head_pos + h_pos + height +
(1 + vowel_s + can_size_s +
head_pos + h_pos || speaker),
data = can,
prior = priors_maxH1,
family = lognormal(),
backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_maxH1_headPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_maxH1_headPos, file = paste0(models, "mdl_maxH1_headPos.rds"), compress = "xz")
mdl_maxH1_headPos <- readRDS(paste0(models, "mdl_maxH1_headPos.rds"))
beepr::beep()
Let’s check how it looks like.
summary(mdl_maxH1_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 + vowel_s + can_size_s + head_pos + h_pos || speaker)
## Data: can (Number of observations: 2864)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.10 0.04 0.01 0.17 1.00 1294 3518
## sd(vowel_s) 0.07 0.01 0.05 0.09 1.00 3257 6901
## sd(can_size_s) 0.01 0.00 0.00 0.02 1.00 3559 5013
## sd(head_pos) 0.00 0.00 0.00 0.00 1.00 1252 3170
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 8091 7935
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.66 0.60 4.46 6.86 1.00 9280 10650
## vowel_s -0.08 0.01 -0.11 -0.06 1.00 2081 3937
## can_size_s -0.00 0.00 -0.01 0.00 1.00 18245 11592
## head_pos 0.01 0.00 0.00 0.01 1.00 29390 11045
## h_pos 0.00 0.00 -0.00 0.00 1.00 22058 11441
## height -0.01 0.00 -0.01 0.00 1.00 9411 11100
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.07 0.00 0.07 0.07 1.00 22067 11520
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_maxH1_headPos)
conditional_effects(mdl_maxH1_headPos, sample_prior = "only")
pp_check(mdl_maxH1_headPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_maxH1_headPos_loo.rds"))) {
mdl_maxH1_headPos_loo <- readRDS(paste0(models, "mdl_maxH1_headPos_loo.rds"))
} else {
mdl_maxH1_headPos_loo <- loo(mdl_maxH1_headPos, moment_match = TRUE)
saveRDS(mdl_maxH1_headPos_loo, paste0(models, "mdl_maxH1_headPos_loo.rds"))
}
Check the loo.
mdl_maxH1_headPos_loo
##
## Computed from 16000 by 2864 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -12064.4 89.5
## p_loo 88.1 8.4
## looic 24128.8 178.9
## ------
## MCSE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. ESS
## (-Inf, 0.7] (good) 2862 99.9% 235
## (0.7, 1] (bad) 2 0.1% <NA>
## (1, Inf) (very bad) 0 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
See if any reloo is needed.
if (file.exists(paste0(models, "mdl_maxH1_headPos_reloo.rds"))) {
mdl_maxH1_headPos_reloo <- readRDS(paste0(models, "mdl_maxH1_headPos_reloo.rds"))
} else {
mdl_maxH1_headPos_reloo <- reloo(mdl_maxH1_headPos_loo, mdl_maxH1_headPos, chains = 1)
saveRDS(mdl_maxH1_headPos_reloo, paste0(models, "mdl_maxH1_headPos_reloo.rds"))
}
mdl_maxH1_headPos_reloo
##
## Computed from 16000 by 2864 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -12064.1 89.3
## p_loo 87.8 8.3
## looic 24128.2 178.7
## ------
## MCSE of elpd_loo is 0.2.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
mdl_maxH1_vPos <- brm(f0_mean ~ 1 + vowel_s + can_size_s +
h_pos + v_pos + height +
(1 + vowel_s + can_size_s +
h_pos + v_pos || speaker),
data = can,
prior = priors_maxH1,
family = lognormal(),
#backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_maxH1_vPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_maxH1_vPos, file = paste0(models, "mdl_maxH1_vPos.rds"), compress = "xz")
mdl_maxH1_vPos <- readRDS(paste0(models, "mdl_maxH1_vPos.rds"))
beepr::beep()
Let’s check how it looks like.
summary(mdl_maxH1_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 + vowel_s + can_size_s + h_pos + v_pos || speaker)
## Data: can (Number of observations: 2870)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.02 0.11 0.18 1.00 3003 5770
## sd(vowel_s) 0.07 0.01 0.05 0.09 1.00 3082 5962
## sd(can_size_s) 0.01 0.00 0.00 0.02 1.00 4119 4324
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 7523 6961
## sd(v_pos) 0.01 0.00 0.00 0.01 1.00 5158 5225
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.58 0.58 4.44 6.71 1.00 2056 3114
## vowel_s -0.08 0.01 -0.11 -0.06 1.00 1527 2867
## can_size_s -0.00 0.00 -0.01 0.00 1.00 15197 12190
## h_pos 0.00 0.00 -0.00 0.00 1.00 23109 12894
## v_pos -0.01 0.00 -0.01 -0.00 1.00 10427 11140
## height -0.00 0.00 -0.01 0.01 1.00 2055 3074
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.07 0.00 0.07 0.07 1.00 18328 12619
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_maxH1_vPos)
conditional_effects(mdl_maxH1_vPos, sample_prior = "only")
pp_check(mdl_maxH1_vPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_maxH1_vPos_loo.rds"))) {
mdl_maxH1_vPos_loo <- readRDS(paste0(models, "mdl_maxH1_vPos_loo.rds"))
} else {
mdl_maxH1_vPos_loo <- loo(mdl_maxH1_vPos, moment_match = TRUE)
saveRDS(mdl_maxH1_vPos_loo, paste0(models, "mdl_maxH1_vPos_loo.rds"))
}
Check the loo.
mdl_maxH1_vPos_loo
##
## Computed from 16000 by 2870 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -12082.1 90.9
## p_loo 104.9 9.3
## looic 24164.2 181.9
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
See if any reloo is needed.
if (file.exists(paste0(models, "mdl_maxH1_vPos_reloo.rds"))) {
mdl_maxH1_vPos_reloo <- readRDS(paste0(models, "mdl_maxH1_vPos_reloo.rds"))
} else {
mdl_maxH1_vPos_reloo <- reloo(mdl_maxH1_vPos_loo, mdl_maxH1_vPos, chains = 1)
saveRDS(mdl_maxH1_vPos_reloo, paste0(models, "mdl_maxH1_vPos_reloo.rds"))
}
mdl_maxH1_vPos_reloo
##
## Computed from 16000 by 2870 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -12082.1 90.9
## p_loo 104.9 9.3
## looic 24164.2 181.9
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
We computed a set of models from integer only, to a maximal model. We will compare them now, but still adhereing to head position vs. vertical position separately.
compH1_headPos <- loo_compare(mdl_noRandomH1_headPos_loo,mdl_maxH1_headPos_reloo)
compH1_headPos
## elpd_diff se_diff
## mdl_maxH1_headPos 0.0 0.0
## mdl_noRandomH1_headPos -249.0 29.9
As expected, the maximal model is the best estimate. We will proceed with extracting the effects from this model.
compH1_vPos <- loo_compare(mdl_intOnlyH1_loo,mdl_noRandomH1_vPos_loo,mdl_maxH1_vPos_reloo)
compH1_vPos
## elpd_diff se_diff
## mdl_maxH1_vPos 0.0 0.0
## mdl_noRandomH1_vPos -255.8 31.1
## mdl_intOnlyH1 -2237.8 78.4
As expected, the maximal model is the best estimate. We will proceed with extracting the effects from this model.
We found that the best-fit models are the maximal models. Now we will extract the predictions from the models.
We would expect:
First, let’s look at the summary again.
summary(mdl_maxH1_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 + vowel_s + can_size_s + head_pos + h_pos || speaker)
## Data: can (Number of observations: 2864)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.10 0.04 0.01 0.17 1.00 1294 3518
## sd(vowel_s) 0.07 0.01 0.05 0.09 1.00 3257 6901
## sd(can_size_s) 0.01 0.00 0.00 0.02 1.00 3559 5013
## sd(head_pos) 0.00 0.00 0.00 0.00 1.00 1252 3170
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 8091 7935
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.66 0.60 4.46 6.86 1.00 9280 10650
## vowel_s -0.08 0.01 -0.11 -0.06 1.00 2081 3937
## can_size_s -0.00 0.00 -0.01 0.00 1.00 18245 11592
## head_pos 0.01 0.00 0.00 0.01 1.00 29390 11045
## h_pos 0.00 0.00 -0.00 0.00 1.00 22058 11441
## height -0.01 0.00 -0.01 0.00 1.00 9411 11100
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.07 0.00 0.07 0.07 1.00 22067 11520
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Now let’s check the hypotheses using brms function.
hypothesis(mdl_maxH1_headPos, "vowel_s < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (vowel_s) < 0 -0.08 0.01 -0.1 -0.06 Inf 1 *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, "vowel_s < 0"))
The assumption holds for vowel: [i] has a higher means f0.
hypothesis(mdl_maxH1_headPos, "head_pos > 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (head_pos) > 0 0.01 0 0 0.01 Inf 1 *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, "head_pos > 0"))
The assumption holds for head_pos: upward head movement induces an increas in f0.
hypothesis(mdl_maxH1_headPos, "height < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (height) < 0 -0.01 0 -0.01 0 26.21 0.96 *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, "height < 0"))
The assumption does hold for height in our data: taller participants tend to have lower f0 means in vowel intervals.
hypothesis(mdl_maxH1_headPos, "can_size_s < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (can_size_s) < 0 0 0 -0.01 0 13.39 0.93
## Star
## 1
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, "can_size_s < 0"))
The assumption does not hold for can_size in our data: larger cans do not generally induce higher f0 means in vowel intervals.
hypothesis(mdl_maxH1_headPos, "h_pos > 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (h_pos) > 0 0 0 0 0 1.93 0.66
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, "h_pos > 0"))
The assumption does not hold for h_pos in our data: an increase in the horizontal position from left to right does not generally induce higher f0 means in vowel intervals.
That is generally super exciting and means that our hypothesis checks out! Head position has an effect on the f0 mean.
Change to report:
summary(can$head_pos) # the unit is centimeters
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 141.8 154.9 160.0 160.4 164.5 178.4 6
exp(fixef(mdl_maxH1_headPos))
## Estimate Est.Error Q2.5 Q97.5
## Intercept 286.4558241 1.821411 86.7857554 955.1441295
## vowel_s 0.9199808 1.012618 0.8977658 0.9436138
## can_size_s 0.9954085 1.003142 0.9892045 1.0015342
## head_pos 1.0056963 1.001072 1.0035660 1.0078033
## h_pos 1.0003938 1.000969 0.9984914 1.0022929
## height 0.9932596 1.003727 0.9858748 1.0005496
# This is giving the original scale for the intercept but odds for coefficients
exp(fixef(mdl_maxH1_headPos)[1,1]) # intercept in Hz
## [1] 286.4558
(exp(fixef(mdl_maxH1_headPos)[1,1])*exp(fixef(mdl_maxH1_headPos)[4,1]))-exp(fixef(mdl_maxH1_headPos)[1,1]) # estimate head_pos in Hz (positive)
## [1] 1.631748
(exp(fixef(mdl_maxH1_headPos)[1,1])*exp(fixef(mdl_maxH1_headPos)[2,1]))-exp(fixef(mdl_maxH1_headPos)[1,1]) # estimate vowel_s in Hz (negative)
## [1] -22.92196
(exp(fixef(mdl_maxH1_headPos)[1,1])*exp(fixef(mdl_maxH1_headPos)[6,1]))-exp(fixef(mdl_maxH1_headPos)[1,1]) # estimate height in Hz (positive)
## [1] -1.930828
All hypotheses tested together:
hypothesis(mdl_maxH1_headPos, c("vowel_s < 0", "head_pos > 0", "height < 0", "can_size_s < 0", "h_pos > 0"))
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (vowel_s) < 0 -0.08 0.01 -0.10 -0.06 Inf 1.00
## 2 (head_pos) > 0 0.01 0.00 0.00 0.01 Inf 1.00
## 3 (height) < 0 -0.01 0.00 -0.01 0.00 26.21 0.96
## 4 (can_size_s) < 0 0.00 0.00 -0.01 0.00 13.39 0.93
## 5 (h_pos) > 0 0.00 0.00 0.00 0.00 1.93 0.66
## Star
## 1 *
## 2 *
## 3 *
## 4
## 5
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, c("vowel_s < 0", "head_pos > 0", "height < 0", "can_size_s < 0", "h_pos > 0")))
We expect the other effects to be the same, except for:
First, let’s look at the summary again.
summary(mdl_maxH1_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f0_mean ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 + vowel_s + can_size_s + h_pos + v_pos || speaker)
## Data: can (Number of observations: 2870)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.14 0.02 0.11 0.18 1.00 3003 5770
## sd(vowel_s) 0.07 0.01 0.05 0.09 1.00 3082 5962
## sd(can_size_s) 0.01 0.00 0.00 0.02 1.00 4119 4324
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 7523 6961
## sd(v_pos) 0.01 0.00 0.00 0.01 1.00 5158 5225
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 5.58 0.58 4.44 6.71 1.00 2056 3114
## vowel_s -0.08 0.01 -0.11 -0.06 1.00 1527 2867
## can_size_s -0.00 0.00 -0.01 0.00 1.00 15197 12190
## h_pos 0.00 0.00 -0.00 0.00 1.00 23109 12894
## v_pos -0.01 0.00 -0.01 -0.00 1.00 10427 11140
## height -0.00 0.00 -0.01 0.01 1.00 2055 3074
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.07 0.00 0.07 0.07 1.00 18328 12619
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Now let’s check the hypotheses using brms function.
hypothesis(mdl_maxH1_vPos, "vowel_s < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (vowel_s) < 0 -0.08 0.01 -0.1 -0.06 Inf 1 *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_vPos, "vowel_s < 0"))
The assumption holds for vowel: [i] has a higher means f0.
hypothesis(mdl_maxH1_vPos, "v_pos < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (v_pos) < 0 -0.01 0 -0.01 0 3199 1 *
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_vPos, "v_pos < 0"))
The assumption holds for head_pos: upward head movement induces an increas in f0.
hypothesis(mdl_maxH1_vPos, "height < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (height) < 0 0 0 -0.01 0 1.47 0.59
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_vPos, "height < 0"))
The assumption does not hold for height in our data: higher participants do not generally have higher f0 means in vowel intervals.
hypothesis(mdl_maxH1_vPos, "can_size_s < 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (can_size_s) < 0 0 0 -0.01 0 16.26 0.94
## Star
## 1
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_vPos, "can_size_s < 0"))
The assumption does not hold for can_size in our data, but it is by a thread!: larger cans do not generally induce higher f0 means in vowel intervals.
hypothesis(mdl_maxH1_vPos, "h_pos > 0")
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob Star
## 1 (h_pos) > 0 0 0 0 0 2.07 0.67
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_vPos, "h_pos > 0"))
The assumption does not hold for h_pos in our data: an increase in the horizontal position from left to right does not generally induce higher f0 means in vowel intervals.
Therefore, we also see an effect of vertical position on f0 mean.
Change to report:
summary(can$v_pos)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 2 3 3 4 5
exp(fixef(mdl_maxH1_vPos))
## Estimate Est.Error Q2.5 Q97.5
## Intercept 265.0350648 1.781365 84.9826617 820.1098809
## vowel_s 0.9197430 1.012882 0.8973541 0.9435620
## can_size_s 0.9951932 1.003087 0.9891198 1.0012171
## h_pos 1.0004336 1.000956 0.9985099 1.0023140
## v_pos 0.9948101 1.001390 0.9920574 0.9975411
## height 0.9992403 1.003455 0.9925357 1.0060652
exp(fixef(mdl_maxH1_vPos)[1,1]) # intercept in Hz
## [1] 265.0351
(exp(fixef(mdl_maxH1_vPos)[1,1])*exp(fixef(mdl_maxH1_vPos)[4,1]))-exp(fixef(mdl_maxH1_vPos)[1,1]) # estimate v_pos in Hz (positive)
## [1] 0.1149117
(exp(fixef(mdl_maxH1_vPos)[1,1])*exp(fixef(mdl_maxH1_vPos)[2,1]))-exp(fixef(mdl_maxH1_vPos)[1,1]) # estimate vowel_s in Hz (negative)
## [1] -21.27091
All hypotheses tested together:
hypothesis(mdl_maxH1_vPos, c("vowel_s < 0", "v_pos < 0", "height < 0", "can_size_s < 0", "h_pos > 0"))
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (vowel_s) < 0 -0.08 0.01 -0.10 -0.06 Inf 1.00
## 2 (v_pos) < 0 -0.01 0.00 -0.01 0.00 3199.00 1.00
## 3 (height) < 0 0.00 0.00 -0.01 0.00 1.47 0.59
## 4 (can_size_s) < 0 0.00 0.00 -0.01 0.00 16.26 0.94
## 5 (h_pos) > 0 0.00 0.00 0.00 0.00 2.07 0.67
## Star
## 1 *
## 2 *
## 3
## 4
## 5
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_vPos, c("vowel_s < 0", "v_pos < 0", "height < 0", "can_size_s < 0", "h_pos > 0")))
Taken together, we see that in both models, vowel has a reliable effect, which is our logical control. Importantly, our hypothesis holds, we find a reliable effect of head position on f0, whereby each 1 cm change in the predictor induces 1 Hz change in the outcome variable. This really is something, given the range of head movement within speaker.
can %>%
group_by(speaker) %>%
dplyr::summarize(head_range = max(head_pos, na.rm = TRUE) - min(head_pos, na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(head_range)) %>%
{. ->> head_range} %>%
print(n = 30)
## # A tibble: 30 Ă— 2
## speaker head_range
## <dbl> <dbl>
## 1 26 10.5
## 2 1 8.81
## 3 21 8.27
## 4 15 7.66
## 5 20 7.26
## 6 31 6.66
## 7 7 6.59
## 8 22 6.48
## 9 4 5.98
## 10 25 4.83
## 11 30 4.80
## 12 11 4.45
## 13 28 4.35
## 14 24 4.34
## 15 14 4.26
## 16 6 4.12
## 17 29 4.05
## 18 10 3.71
## 19 9 3.52
## 20 17 3.47
## 21 8 3.45
## 22 18 3.27
## 23 19 3.24
## 24 2 3.11
## 25 27 2.92
## 26 3 2.92
## 27 23 2.62
## 28 12 2.44
## 29 5 2.37
## 30 13 1.88
Given the various head movement behavior, we can also expect big inter-speaker variance.
# brmstools::forest(mdl_maxH1_headPos,
# grouping = "speaker",
# pars = "head_pos",
# digits = 0,
# sort = T)
# See new way:
# https://github.com/mvuorre/brmstools?tab=readme-ov-file#forest-plots
The effect is present across all speakers. It would be nice to see how this relates to being mover or non-mover.
p_hPos_mover <- mdl_maxH1_headPos %>%
spread_draws(b_Intercept, b_head_pos, r_speaker[speaker,term]) %>%
filter(term == "head_pos") %>%
mutate(mu = b_head_pos + r_speaker) %>%
ungroup() %>%
mutate(speaker = str_replace_all(speaker, "[.]", " ")) %>%
mutate(speaker = as.double(speaker)) %>%
left_join(can %>% select(speaker, mover), by = "speaker") %>%
ggplot(aes(x = mu, y = reorder(speaker, mu), fill = mover)) +
geom_halfeyeh(.width = .5, size = 2/3, alpha = 0.5) + # Adjusted alpha for fill color
scale_fill_manual(values = colorBlindBlack8) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey") + # Grey dashed line at 0
labs(x = "Posterior distribution (Head position)", y = "Speaker", fill = "Mover") +
theme_minimal()
## Warning in left_join(., can %>% select(speaker, mover), by = "speaker"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## Warning in geom_halfeyeh(.width = 0.5, size = 2/3, alpha = 0.5): 'geom_halfeyeh' ist veraltet.
## Benutzen Sie stattdessen 'stat_halfeye'
## Siehe help("Deprecated") und help("tidybayes-deprecated").
print(p_hPos_mover +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(file = paste0(plots, "p_hPos_mover.pdf"), plot = last_plot(), width = 8, height = 10)
# this is very memory-consuming, so run this to kind of "reset"
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 5787421 309.1 10013642 534.8 10013642 534.8
## Vcells 437508851 3338.0 1344889227 10260.7 1344831992 10260.3
This shows that being a mover has nothing to do with the effect. Person 7 shows the strongest effect and she is a mover, but person 2 is not a mover and her effect is second. So the effect is equally strong in people, who move less.
We had to separate head position and vertical position due to their correlation and calculate separate models. The effect of vertical position that we found is reliable, but translated to real-world terms, much weaker. For 1 step change in vertical position, we find roughly 1 Hz change. But since our plane only comprised of 5 steps, which were projected on the surface of 1,30 meters, this is much weaker an effect than we exposed for head position.
Most likely, the effect we find for vertical position is the one induced through head movement –HOW to show?
Having raw and posterior values, we can show the effects visually.
Scatter plot with regression lines: f0 mean x head position
By vowel, z-normalized:
f0_head_pos_vow_z <- ggplot(can,
aes(x = head_pos_z,
y = f0_mean_z,
color = factor(vowel))) +
geom_point(alpha = 0.4) +
stat_smooth(size = 1,
method = "lm",
se = TRUE) +
labs(color = "Vowel", x = "Head Position (z-normalized)", y = "f0 Mean (z-normalized)") +
theme_minimal() +
scale_color_manual(values = colorBlindBlack8)
print(f0_head_pos_vow_z + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 136 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 136 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave(filename = paste0(plots, "f0_head_pos_vow_z.pdf"),
plot = last_plot(),
width = 8, height = 5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 136 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 136 rows containing missing values or values outside the scale range
## (`geom_point()`).
By vowels, raw values:
f0_head_pos_vow_raw <- ggplot(can,
aes(x = head_pos,
y = f0_mean,
color = factor(vowel))) +
geom_point(alpha = 0.4) +
stat_smooth(size = 1,
method = "lm",
se = TRUE) +
labs(color = "Vowel", x = "Head Position (cm)", y = "f0 Mean (Hz)") +
theme_minimal() +
scale_color_manual(values = colorBlindBlack8)
print(f0_head_pos_vow_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 136 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 136 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave(filename = paste0(plots, "f0_head_pos_vow_raw.pdf"),
plot = last_plot(),
width = 8, height = 5)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 136 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 136 rows containing missing values or values outside the scale range
## (`geom_point()`).
By size, z:
f0_head_pos_size_z <- ggplot(can,
aes(x = head_pos_z,
y = f0_mean_z,
color = can_size)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Head position (z-normalized)",
y = "f0 mean (z-normalized)",
color = "Can size")
print(f0_head_pos_size_z + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_head_pos_size_z.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By size, raw:
f0_head_pos_size_raw <- ggplot(can,
aes(x = head_pos,
y = f0_mean,
color = can_size)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Head position (cm)",
y = "f0 mean (Hz)",
color = "Can size")
print(f0_head_pos_size_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_head_pos_size_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By mover, z:
f0_head_pos_mover_z <- ggplot(can,
aes(x = head_pos_z,
y = f0_mean_z,
color = mover)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Head position (z-normalized)",
y = "f0 mean (z-normalized)",
color = "Mover")
print(f0_head_pos_mover_z + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_head_pos_mover_z.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By mover, raw:
f0_head_pos_mover_raw <- ggplot(can,
aes(x = head_pos,
y = f0_mean,
color = mover)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Head position (cm)",
y = "f0 mean (Hz)",
color = "Mover")
print(f0_head_pos_mover_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_head_pos_mover_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
Z-normalized:
f0_vowel_size_z <- ggplot(can,
aes(y = f0_mean_z,
x = vowel,
color = can_size)) +
stat_summary(position = position_dodge(width = 0.5),
size = 0.75) +
scale_color_manual(values = colorBlindBlack8)+
labs(x = "Vowel",
y = "f0 mean (z-normalized)",
color = "Can size")
print(f0_vowel_size_z +
theme_bw()+
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
## Warning: Removed 130 rows containing non-finite outside the scale range
## (`stat_summary()`).
## No summary function supplied, defaulting to `mean_se()`
# ggsave(filename = paste0(plots, "f0_vowel_size_z.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
Raw values:
f0_vowel_size_raw <- ggplot(can,
aes(y = f0_mean,
x = vowel,
color = can_size)) +
stat_summary(position = position_dodge(width = 0.5),
size = 0.75) +
scale_color_manual(values = colorBlindBlack8)+
labs(x = "Vowel",
y = "f0 mean (Hz)",
color = "Can size")
print(f0_vowel_size_raw +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
## Warning: Removed 130 rows containing non-finite outside the scale range
## (`stat_summary()`).
## No summary function supplied, defaulting to `mean_se()`
# ggsave(filename = paste0(plots, "f0_vowel_size_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
The intrisic F0 difference between a and I is clearly visible with a having much lower F0 than I. Apart from that, we can see the effect of can size in both a and I, with large can yielding lower F0 than a small can.
Now let’s see how this relationship looks like in all possible vertical and horizontal positions. I’m creating a 5x5 grid
print(f0_vowel_size_raw +
facet_grid(v_pos~h_pos) +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vowel_size_raw_grid.pdf"),
# plot = last_plot(),
# width = 10, height = 8)
The a is always lower than the I, but we can see that here the relationship between small and large can size is sometimes flipped to the general tendency in the plot above.
Let’s have a look how the different speakers are behaving.
print(f0_vowel_size_raw +
facet_wrap(~speaker) +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vowel_size_raw_speaker.pdf"),
# plot = last_plot(),
# width = 10, height = 8)
There is a lot of individual differences between the speakers. E.g. 25 shows barely an effect for vowel and an opposite effect for size, especially striking in a; similarly 12 shows no effect for size and only a small for vowel; speakers 22 and 27 have a similar pattern of their behavior, in line also with 14 and 19.
Let’s see how the movers behave:
print(f0_vowel_size_raw +
facet_wrap(~mover) +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vowel_size_raw_mover.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
# Extracting posterior samples
post_maxH1_headPos <- mdl_maxH1_headPos %>%
gather_draws(b_vowel_s, b_can_size_s, b_head_pos, b_h_pos, b_height)
# Plotting intervals with densities
postplot_maxH1_headPos <-
ggplot(post_maxH1_headPos,
aes(x = .value, y = .variable)) +
stat_halfeye(.width = 0.95, fill = colorBlindBlack8[2], alpha = 0.7, size = 2) +
geom_segment(x = 0, xend = 0, y = -Inf, yend = Inf,
linetype = "dashed", color = "grey", size = 1) +
theme_bw() +
labs(x = "Posterior density", y = "Variable") +
scale_y_discrete(labels = c("Can size", "Horizontal pos.", "Head position", "Height", "Vowel")) + # Must be in alphabetical order
scale_x_continuous(limits = c(-0.12, 0.02))
print(postplot_maxH1_headPos +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold")))
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`stat_slabinterval()`).
# ggsave(filename = paste0(plots, "postplot_maxH1_headPos.pdf"),
# plot = last_plot(),
# width = 10, height = 8)
Scatter plot with regression lines: f0 mean x vertical position
ggplot(can, aes(x = v_pos, y = f0_mean, color = factor(vowel))) +
geom_point(alpha = 0.5) +
stat_smooth(method = "lm", se = TRUE) +
labs(color = "Vowel", x = "Vertical Position", y = "f0 Mean (Hz)") +
theme_minimal() +
scale_color_manual(values = colorBlindBlack8)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 130 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 130 rows containing missing values or values outside the scale range
## (`geom_point()`).
In the past research, the relationship between the vertical position and F0 was frequently pointed to as a robust one, especially in perception, but also in production. Let’s see how it looks like in our data:
f0_vertical <- ggplot(can,
aes(y = f0_mean_z,
x = v_pos,
color = can_size)) +
stat_summary(geom = 'line',
fun = mean,
position = position_dodge(width = 0.5),
size = 2) +
stat_summary(position = position_dodge(width = 0.5),
size = 1 ) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Vertical Position",
y = "f0 mean (z-normalized)",
color = "Can size") +
theme_bw()+
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face="bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
f0_vertical
ggsave(plot = f0_vertical,
filename = paste0(plots, "f0-vertical.pdf"),
width = 8, height = 5)
By vowel, z-normalized:
f0_vPos_vow_z <- ggplot(can,
aes(x = v_pos,
y = f0_mean_z,
color = factor(vowel))) +
geom_point(alpha = 0.4) +
stat_smooth(size = 1,
method = "lm",
se = TRUE) +
labs(color = "Vowel", x = "Vertical Position", y = "f0 Mean (z-normalized)") +
theme_minimal() +
scale_color_manual(values = colorBlindBlack8)
print(f0_vPos_vow_z + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 130 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 130 rows containing missing values or values outside the scale range
## (`geom_point()`).
# ggsave(filename = paste0(plots, "f0_vPos_vow_z.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By vowels, raw values:
f0_vPos_vow_raw <- ggplot(can,
aes(x = v_pos,
y = f0_mean,
color = factor(vowel))) +
geom_point(alpha = 0.4) +
stat_smooth(size = 1,
method = "lm",
se = TRUE) +
labs(color = "Vowel", x = "Vertical Position", y = "f0 Mean (Hz)") +
theme_minimal() +
scale_color_manual(values = colorBlindBlack8)
print(f0_vPos_vow_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 130 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 130 rows containing missing values or values outside the scale range
## (`geom_point()`).
# ggsave(filename = paste0(plots, "f0_vPos_vow_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By size, z:
f0_vPos_size_z <- ggplot(can,
aes(x = v_pos,
y = f0_mean_z,
color = can_size)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Vertical Position",
y = "f0 Mean (z-normalized)",
color = "Can Size")
print(f0_vPos_size_z + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vPos_size_z.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By size, raw:
f0_vPos_size_raw <- ggplot(can,
aes(x = v_pos,
y = f0_mean,
color = can_size)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Vertical Position",
y = "f0 Mean (Hz)",
color = "Can Size")
print(f0_vPos_size_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vPos_size_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By mover, z:
f0_vPos_size_raw <- ggplot(can,
aes(x = v_pos,
y = f0_mean_z,
color = mover)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Vertical Position",
y = "f0 Mean (z-normalized)",
color = "Mover")
print(f0_vPos_size_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vPos_size_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
By mover, raw:
f0_vPos_mover_raw <- ggplot(can,
aes(x = v_pos,
y = f0_mean,
color = mover)) +
geom_point(alpha = 0.4) +
geom_smooth(size = 1,
method = lm,
se = T) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "Vertical Position",
y = "f0 Mean (Hz)",
color = "Mover")
print(f0_vPos_mover_raw + theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
# ggsave(filename = paste0(plots, "f0_vPos_mover_raw.pdf"),
# plot = last_plot(),
# width = 8, height = 5)
# Extracting posterior samples
post_maxH1_vPos <- mdl_maxH1_vPos %>%
gather_draws(b_vowel_s, b_can_size_s, b_v_pos, b_h_pos, b_height)
# Plotting intervals with densities
postplot_maxH1_vPos <-
ggplot(post_maxH1_vPos,
aes(x = .value, y = .variable)) +
stat_halfeye(.width = 0.95, fill = colorBlindBlack8[2], alpha = 0.7, size = 2) +
geom_segment(x = 0, xend = 0, y = -Inf, yend = Inf,
linetype = "dashed", color = "grey", size = 1) +
theme_bw() +
labs(x = "Posterior density", y = "Variable") +
scale_y_discrete(labels = c("Can size", "Horizontal pos.", "Height", "Vertical pos.", "Vowel")) + # Must be in alphabetical order
scale_x_continuous(limits = c(-0.13, 0.01))
print(postplot_maxH1_vPos +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold")))
## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`stat_slabinterval()`).
# ggsave(filename = paste0(plots, "postplot_maxH1_vPos.pdf"),
# plot = last_plot(),
# width = 10, height = 8)
The idea is to calculate the area of vowel space per speaker per size and compare the areas for large vs. small cans with a simple t-test to have an initial test of the hypothesis.
Solution found here: https://stackoverflow.com/questions/38782051/how-to-calculate-the-area-of-ellipse-drawn-by-ggplot2
Filter vowels and sizes to make plots and ellipsis calculations correct.
formants_a_large <- can %>%
filter(vowel == "a") %>%
filter(.,can_size == 'large')
formants_a_small <- can %>%
filter(vowel == "a") %>%
filter(.,can_size == 'small')
formants_i_large <- can %>%
filter(vowel == "I") %>%
filter(.,can_size == 'large')
formants_i_small <- can %>%
filter(vowel == "I") %>%
filter(.,can_size == 'small')
# Plot object
area_a_large = ggplot(formants_a_large,
aes(y = F1,
x = F2)) +
geom_point() +
stat_ellipse(segments = 201) # Default is 51. We use a finer grid for more accurate area.
# Get ellipse coordinates from plot
pb = ggplot_build(area_a_large)
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_ellipse()`).
el = pb$data[[2]][c("x", "y")]
# Center of ellipse
ctr = MASS::cov.trob(el)$center # Per @Roland's comment
# Calculate distance to center from each point on the ellipse
dist2center <- sqrt(rowSums((t(t(el) - ctr))^2))
# Calculate area of ellipse from semi-major and semi-minor axes.
# These are, respectively, the largest and smallest values of dist2center.
area_a_large <- pi*min(dist2center)*max(dist2center)
area_a_large
## [1] 172265
# Plot object
area_a_small = ggplot(formants_a_small,
aes(y = F1,
x = F2)) +
geom_point() +
stat_ellipse(segments = 201) # Default is 51. We use a finer grid for more accurate area.
# Get ellipse coordinates from plot
pb = ggplot_build(area_a_small)
## Warning: Removed 22 rows containing non-finite outside the scale range
## (`stat_ellipse()`).
el = pb$data[[2]][c("x", "y")]
# Center of ellipse
ctr = MASS::cov.trob(el)$center # Per @Roland's comment
# Calculate distance to center from each point on the ellipse
dist2center <- sqrt(rowSums((t(t(el) - ctr))^2))
# Calculate area of ellipse from semi-major and semi-minor axes.
# These are, respectively, the largest and smallest values of dist2center.
area_a_small <- pi*min(dist2center)*max(dist2center)
area_a_small
## [1] 177409.7
# Plot object
area_i_large = ggplot(formants_i_large,
aes(y = F1,
x = F2)) +
geom_point() +
stat_ellipse(segments = 201) # Default is 51. We use a finer grid for more accurate area.
# Get ellipse coordinates from plot
pb = ggplot_build(area_i_large)
## Warning: Removed 16 rows containing non-finite outside the scale range
## (`stat_ellipse()`).
el = pb$data[[2]][c("x", "y")]
# Center of ellipse
ctr = MASS::cov.trob(el)$center # Per @Roland's comment
# Calculate distance to center from each point on the ellipse
dist2center <- sqrt(rowSums((t(t(el) - ctr))^2))
# Calculate area of ellipse from semi-major and semi-minor axes.
# These are, respectively, the largest and smallest values of dist2center.
area_i_large <- pi*min(dist2center)*max(dist2center)
area_i_large
## [1] 125235.6
# Plot object
area_i_small = ggplot(formants_i_small,
aes(y = F1,
x = F2)) +
geom_point() +
stat_ellipse(segments = 201) # Default is 51. We use a finer grid for more accurate area.
# Get ellipse coordinates from plot
pb = ggplot_build(area_i_small)
## Warning: Removed 23 rows containing non-finite outside the scale range
## (`stat_ellipse()`).
el = pb$data[[2]][c("x", "y")]
# Center of ellipse
ctr = MASS::cov.trob(el)$center # Per @Roland's comment
# Calculate distance to center from each point on the ellipse
dist2center <- sqrt(rowSums((t(t(el) - ctr))^2))
# Calculate area of ellipse from semi-major and semi-minor axes.
# These are, respectively, the largest and smallest values of dist2center.
area_i_small <- pi*min(dist2center)*max(dist2center)
area_i_small
## [1] 117526.4
Create tibble from areas
areas <- tibble(
size_large = c(area_a_large, area_i_large),
size_small = c(area_a_small, area_i_small)
)
And run a t.test on areas
t.test(areas$size_large, areas$size_small, paired = T)
##
## Paired t-test
##
## data: areas$size_large and areas$size_small
## t = 0.19951, df = 1, p-value = 0.8746
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -80379.72 82944.22
## sample estimates:
## mean difference
## 1282.253
t-test shows no difference between the vowel space of small and large cans.
Remove unused variables
rm(area_a_large,area_a_small,area_i_large,area_i_small,ctr,dist2center,pb,el)
As per reviewer comment, I will calculate formant dispersion and use it for a separate model.
can <- can %>%
mutate(f_disp = ((F2 - F1) + (F3 - F2)) / 2)
Let’s have a look.
can %>%
group_by(speaker, vowel) %>%
dplyr::summarize(mean_f_disp = mean(f_disp, na.rm = TRUE)) %>%
ungroup() %>%
print(n = Inf)
## `summarise()` has grouped output by 'speaker'. You can override using the
## `.groups` argument.
## # A tibble: 60 Ă— 3
## speaker vowel mean_f_disp
## <dbl> <chr> <dbl>
## 1 1 I 1116.
## 2 1 a 962.
## 3 2 I 1118.
## 4 2 a 1075.
## 5 3 I 1401.
## 6 3 a 1106.
## 7 4 I 1119.
## 8 4 a 890.
## 9 5 I 1124.
## 10 5 a 1058.
## 11 6 I 1074.
## 12 6 a 917.
## 13 7 I 1067.
## 14 7 a 938.
## 15 8 I 1131.
## 16 8 a 912.
## 17 9 I 1190.
## 18 9 a 964.
## 19 10 I 1233.
## 20 10 a 1007.
## 21 11 I 1065.
## 22 11 a 934.
## 23 12 I 1107.
## 24 12 a 1003.
## 25 13 I 1067.
## 26 13 a 687.
## 27 14 I 1077.
## 28 14 a 993.
## 29 15 I 1053.
## 30 15 a 955.
## 31 17 I 1152.
## 32 17 a 1042.
## 33 18 I 1338.
## 34 18 a 933.
## 35 19 I 1111.
## 36 19 a 1045.
## 37 20 I 1096.
## 38 20 a 952.
## 39 21 I 1171.
## 40 21 a 986.
## 41 22 I 1094.
## 42 22 a 940.
## 43 23 I 1181.
## 44 23 a 906.
## 45 24 I 1076.
## 46 24 a 973.
## 47 25 I 998.
## 48 25 a 949.
## 49 26 I 1141.
## 50 26 a 935.
## 51 27 I 1100.
## 52 27 a 1070.
## 53 28 I 1076.
## 54 28 a 927.
## 55 29 I 1026.
## 56 29 a 983.
## 57 30 I 1149.
## 58 30 a 962.
## 59 31 I 1075.
## 60 31 a 982.
There are two possible dependent variables to be tested: F1 and jaw opening. Let’s see what is their correlation in our data:
cor.test(can$lip_dist, can$F1)
##
## Pearson's product-moment correlation
##
## data: can$lip_dist and can$F1
## t = 31.959, df = 2926, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4813229 0.5350428
## sample estimates:
## cor
## 0.5086777
Plotting the raw values…
ggplot(can,
aes(x = lip_dist,
y = F1)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = F)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 72 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 72 rows containing missing values or values outside the scale range
## (`geom_point()`).
Very highly correlated, as expected. Let’s see about the formant dispersion, as suggested by one of the reviewers.
cor.test(can$lip_dist, can$f_disp)
##
## Pearson's product-moment correlation
##
## data: can$lip_dist and can$f_disp
## t = -30.373, df = 2861, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5210013 -0.4655819
## sample estimates:
## cor
## -0.4937929
Plotting the raw values…
ggplot(can,
aes(x = lip_dist,
y = f_disp)) +
geom_point(alpha = 0.2) +
geom_smooth(size = 1,
method = lm,
se = F)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 137 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 137 rows containing missing values or values outside the scale range
## (`geom_point()`).
This is very interesting. There are some “whiskers” that are clearly not linear.
Given this interesting observation, I think it is fair to test both outcome variables: jaw opening and formant dispersion.
Here, I’m computing a model that estimates the probability of an effect of can size (among others) on lip distance and on formant dispersion. The first iteration of this analysis was done with the help of Timo Roettger. Since then, however, the analysis has been upgraded with the comments of three anonymous reviewers and Bodo Winter.
The variables are already contrast-coded.
We don’t know about the distribution of the lip distance and formant dispersion. This is important for choosing the appropriate family.
# Histogram for lip_dist
ggplot(can, aes(x = lip_dist)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
labs(title = "Distribution of lip_dist", x = "lip_dist", y = "Frequency") +
theme_minimal()
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).
# looks kind of normal
# Histogram for f_disp
ggplot(can, aes(x = f_disp)) +
geom_histogram(bins = 30, fill = "blue", color = "black") +
labs(title = "Distribution of f_disp", x = "f_disp", y = "Frequency") +
theme_minimal()
## Warning: Removed 108 rows containing non-finite outside the scale range
## (`stat_bin()`).
# this is skewed, I would opt for lognormal
# Descriptive Statistics
summary(can$lip_dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.221 3.947 4.249 4.262 4.546 5.495 30
summary(can$f_disp)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 427.6 964.2 1044.4 1044.5 1119.4 1521.9 108
While lip distance looks normally distributed, formant dispersion is definitely skewed. I am also a little suspicious of the “normality” in lip_disp. We will fit the priors accordingly to this.
We will also turn to the maximal model in both cases instantly and only if the model will not perform well, we will seek an alternative.
For each model, I will fit weakly informative priors.
I divide in two streams: lip distance and formant dispersion.
Since this one will be lognormal, let’s look what values we can expect.
# In intercept is 1000
log(4)
## [1] 1.386294
# If we set the prior to normal(0,1), what would be the boundaries
exp(log(4)-.5); exp(log(4)+.5)
## [1] 2.426123
## [1] 6.594885
# If we set the prior to normal(0,2), what would be the boundaries
exp(log(4)-1); exp(log(4)+1)
## [1] 1.471518
## [1] 10.87313
# If we set the prior to normal(0,3), what would be the boundaries
exp(log(4)-1.5); exp(log(4)+1.5)
## [1] 0.8925206
## [1] 17.92676
get_prior(formula = lip_dist ~ 1 + vowel_s + can_size_s + head_pos + h_pos + v_pos + height + # head_pos and v_pos will be separated
(1 + vowel_s + can_size_s + head_pos + h_pos + v_pos || speaker),
data = can,
family = lognormal())
## Warning: Rows containing NAs were excluded from the model.
I will not specify any priors for the group-level effects.
priors_maxH2_lip <- c(
prior('normal(0, 3)', class = 'Intercept'),
prior('normal(0, 1)', class = b)
)
We have to fit both dependent variables with the two predictors head position and vertical position separately.
mdl_maxH2_lip_headPos <- brm(lip_dist ~
1 + vowel_s + can_size_s +
head_pos + h_pos + height +
(1 + vowel_s + can_size_s +
head_pos + h_pos || speaker),
data = can,
prior = priors_maxH2_lip,
family = lognormal(),
#backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_maxH2_lip_headPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_maxH2_lip_headPos, file = paste0(models, "mdl_maxH2_lip_headPos.rds"), compress = "xz")
mdl_maxH2_lip_headPos <- readRDS(paste0(models, "mdl_maxH2_lip_headPos.rds"))
# beepr::beep()
Let’s check how it looks like.
summary(mdl_maxH2_lip_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: lip_dist ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 + vowel_s + can_size_s + head_pos + h_pos || speaker)
## Data: can (Number of observations: 2964)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.02 0.01 0.10 1.00 2032 2338
## sd(vowel_s) 0.03 0.00 0.02 0.04 1.00 2844 5980
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 3343 3019
## sd(head_pos) 0.00 0.00 0.00 0.00 1.00 1714 2020
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 6331 8111
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 1.59 0.38 0.84 2.35 1.00 9117 9704
## vowel_s 0.10 0.01 0.09 0.11 1.00 1739 3122
## can_size_s 0.00 0.00 -0.00 0.00 1.00 16509 12925
## head_pos 0.01 0.00 0.01 0.01 1.00 14359 9245
## h_pos 0.00 0.00 -0.00 0.00 1.00 12208 12357
## height -0.01 0.00 -0.01 -0.00 1.00 9271 10719
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.03 0.00 0.03 0.03 1.00 21519 11798
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_maxH2_lip_headPos)
conditional_effects(mdl_maxH2_lip_headPos, sample_prior = "only")
pp_check(mdl_maxH2_lip_headPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_maxH2_lip_headPos_loo.rds"))) {
mdl_maxH2_lip_headPos_loo <- readRDS(paste0(models, "mdl_maxH2_lip_headPos_loo.rds"))
} else {
mdl_maxH2_lip_headPos_loo <- loo(mdl_maxH2_lip_headPos, moment_match = TRUE)
saveRDS(mdl_maxH2_lip_headPos_loo, paste0(models, "mdl_maxH2_lip_headPos_loo.rds"))
}
Check the loo.
mdl_maxH2_lip_headPos_loo
##
## Computed from 16000 by 2964 log-likelihood matrix.
##
## Estimate SE
## elpd_loo 2082.1 56.4
## p_loo 94.0 4.1
## looic -4164.1 112.7
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
See if any reloo is needed.
if (file.exists(paste0(models, "mdl_maxH2_lip_headPos_reloo.rds"))) {
mdl_maxH2_lip_headPos_reloo <- readRDS(paste0(models, "mdl_maxH2_lip_headPos_reloo.rds"))
} else {
mdl_maxH2_lip_headPos_reloo <- reloo(mdl_maxH2_lip_headPos_loo, mdl_maxH2_lip_headPos, chains = 1)
saveRDS(mdl_maxH2_lip_headPos_reloo, paste0(models, "mdl_maxH2_lip_headPos_reloo.rds"))
}
mdl_maxH2_lip_headPos_reloo
##
## Computed from 16000 by 2964 log-likelihood matrix.
##
## Estimate SE
## elpd_loo 2082.1 56.4
## p_loo 94.0 4.1
## looic -4164.1 112.7
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
mdl_maxH2_lip_vPos <- brm(lip_dist ~ 1 + vowel_s + can_size_s +
h_pos + v_pos + height +
(1 + vowel_s + can_size_s +
h_pos + v_pos || speaker),
data = can,
prior = priors_maxH2_lip,
family = lognormal(),
#backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_maxH2_lip_vPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_maxH2_lip_vPos, file = paste0(models, "mdl_maxH2_lip_vPos.rds"), compress = "xz")
mdl_maxH2_lip_vPos <- readRDS(paste0(models, "mdl_maxH2_lip_vPos.rds"))
# beepr::beep()
Let’s check how it looks like.
summary(mdl_maxH2_lip_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: lip_dist ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 + vowel_s + can_size_s + h_pos + v_pos || speaker)
## Data: can (Number of observations: 2970)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.08 0.01 0.06 0.11 1.00 2934 5208
## sd(vowel_s) 0.03 0.00 0.02 0.04 1.00 3385 6365
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 4283 3450
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 6909 8219
## sd(v_pos) 0.00 0.00 0.00 0.00 1.00 7994 11234
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 1.59 0.34 0.91 2.26 1.00 2602 4580
## vowel_s 0.10 0.01 0.09 0.11 1.00 1764 3139
## can_size_s 0.00 0.00 -0.00 0.00 1.00 16528 12544
## h_pos 0.00 0.00 -0.00 0.00 1.00 13969 12227
## v_pos -0.01 0.00 -0.01 -0.00 1.00 9806 10948
## height -0.00 0.00 -0.00 0.00 1.00 2592 4560
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.03 0.00 0.03 0.03 1.00 23486 11333
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_maxH2_lip_vPos)
conditional_effects(mdl_maxH2_lip_vPos, sample_prior = "only")
pp_check(mdl_maxH2_lip_vPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_maxH2_lip_vPos_loo.rds"))) {
mdl_maxH2_lip_vPos_loo <- readRDS(paste0(models, "mdl_maxH2_lip_vPos_loo.rds"))
} else {
mdl_maxH2_lip_vPos_loo <- loo(mdl_maxH2_lip_vPos, moment_match = TRUE)
saveRDS(mdl_maxH2_lip_vPos_loo, paste0(models, "mdl_maxH2_lip_vPos_loo.rds"))
}
Check the loo.
mdl_maxH2_lip_vPos_loo
##
## Computed from 16000 by 2970 log-likelihood matrix.
##
## Estimate SE
## elpd_loo 2123.4 56.4
## p_loo 111.1 4.4
## looic -4246.8 112.7
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
We would expect:
First, let’s look at the summary again.
summary(mdl_maxH2_lip_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: lip_dist ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 + vowel_s + can_size_s + head_pos + h_pos || speaker)
## Data: can (Number of observations: 2964)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.06 0.02 0.01 0.10 1.00 2032 2338
## sd(vowel_s) 0.03 0.00 0.02 0.04 1.00 2844 5980
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 3343 3019
## sd(head_pos) 0.00 0.00 0.00 0.00 1.00 1714 2020
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 6331 8111
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 1.59 0.38 0.84 2.35 1.00 9117 9704
## vowel_s 0.10 0.01 0.09 0.11 1.00 1739 3122
## can_size_s 0.00 0.00 -0.00 0.00 1.00 16509 12925
## head_pos 0.01 0.00 0.01 0.01 1.00 14359 9245
## h_pos 0.00 0.00 -0.00 0.00 1.00 12208 12357
## height -0.01 0.00 -0.01 -0.00 1.00 9271 10719
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.03 0.00 0.03 0.03 1.00 21519 11798
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
All hypotheses tested together:
hypothesis(mdl_maxH2_lip_headPos, c("vowel_s > 0", "head_pos > 0", "height < 0", "can_size_s > 0", "h_pos > 0"))
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (vowel_s) > 0 0.10 0.01 0.09 0.11 Inf 1.00
## 2 (head_pos) > 0 0.01 0.00 0.01 0.01 Inf 1.00
## 3 (height) < 0 -0.01 0.00 -0.01 0.00 515.13 1.00
## 4 (can_size_s) > 0 0.00 0.00 0.00 0.00 2.08 0.67
## 5 (h_pos) > 0 0.00 0.00 0.00 0.00 7.69 0.88
## Star
## 1 *
## 2 *
## 3 *
## 4
## 5
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH1_headPos, c("vowel_s < 0", "head_pos > 0", "height < 0", "can_size_s < 0", "h_pos > 0")))
A summary of the findings from the data, the priors, and the model:
Change to report:
summary(can$lip_dist) # the unit is centimeters
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.221 3.947 4.249 4.262 4.546 5.495 30
exp(fixef(mdl_maxH2_lip_headPos))
## Estimate Est.Error Q2.5 Q97.5
## Intercept 4.9145687 1.462363 2.3263929 10.5356796
## vowel_s 1.1057555 1.005741 1.0937317 1.1185901
## can_size_s 1.0006038 1.001344 0.9979718 1.0032646
## head_pos 1.0063235 1.000428 1.0054806 1.0071730
## h_pos 1.0006824 1.000582 0.9995219 1.0017971
## height 0.9930818 1.002310 0.9884786 0.9975418
exp(fixef(mdl_maxH2_lip_headPos)[1,1]) # intercept in cm
## [1] 4.914569
(exp(fixef(mdl_maxH2_lip_headPos)[1,1])*exp(fixef(mdl_maxH2_lip_headPos)[4,1]))-exp(fixef(mdl_maxH2_lip_headPos)[1,1]) # estimate head_pos in cm (positive), i.e., every 1 cm change in head_pos causes a change in lip_dist of
## [1] 0.03107739
(exp(fixef(mdl_maxH2_lip_headPos)[1,1])*exp(fixef(mdl_maxH2_lip_headPos)[2,1]))-exp(fixef(mdl_maxH2_lip_headPos)[1,1]) # estimate vowel_s in cm, i.e., difference between a and i is 5 mm
## [1] 0.5197426
(exp(fixef(mdl_maxH2_lip_headPos)[1,1])*exp(fixef(mdl_maxH2_lip_headPos)[6,1]))-exp(fixef(mdl_maxH2_lip_headPos)[1,1]) # estimate height in cm, i.e., every 1 cm change in height causes a change in lip_dist of
## [1] -0.03399996
We would expect:
First, let’s look at the summary again.
summary(mdl_maxH2_lip_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: lip_dist ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 + vowel_s + can_size_s + h_pos + v_pos || speaker)
## Data: can (Number of observations: 2970)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.08 0.01 0.06 0.11 1.00 2934 5208
## sd(vowel_s) 0.03 0.00 0.02 0.04 1.00 3385 6365
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 4283 3450
## sd(h_pos) 0.00 0.00 0.00 0.00 1.00 6909 8219
## sd(v_pos) 0.00 0.00 0.00 0.00 1.00 7994 11234
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 1.59 0.34 0.91 2.26 1.00 2602 4580
## vowel_s 0.10 0.01 0.09 0.11 1.00 1764 3139
## can_size_s 0.00 0.00 -0.00 0.00 1.00 16528 12544
## h_pos 0.00 0.00 -0.00 0.00 1.00 13969 12227
## v_pos -0.01 0.00 -0.01 -0.00 1.00 9806 10948
## height -0.00 0.00 -0.00 0.00 1.00 2592 4560
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.03 0.00 0.03 0.03 1.00 23486 11333
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
All hypotheses tested together:
hypothesis(mdl_maxH2_lip_vPos, c("vowel_s > 0", "v_pos < 0", "height < 0", "can_size_s > 0", "h_pos > 0"))
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (vowel_s) > 0 0.10 0.01 0.09 0.11 Inf 1.00
## 2 (v_pos) < 0 -0.01 0.00 -0.01 0.00 Inf 1.00
## 3 (height) < 0 0.00 0.00 0.00 0.00 1.87 0.65
## 4 (can_size_s) > 0 0.00 0.00 0.00 0.00 1.97 0.66
## 5 (h_pos) > 0 0.00 0.00 0.00 0.00 9.30 0.90
## Star
## 1 *
## 2 *
## 3
## 4
## 5
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH2_lip_vPos, c("vowel_s < 0", "v_pos < 0", "height < 0", "can_size_s < 0", "h_pos > 0")))
A summary of the findings from the data, the priors, and the model:
Change to report:
summary(can$lip_dist) # the unit is centimeters
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.221 3.947 4.249 4.262 4.546 5.495 30
exp(fixef(mdl_maxH2_lip_vPos))
## Estimate Est.Error Q2.5 Q97.5
## Intercept 4.8928233 1.404602 2.4837001 9.6256658
## vowel_s 1.1053027 1.005786 1.0926892 1.1177530
## can_size_s 1.0005345 1.001322 0.9979441 1.0031381
## h_pos 1.0007155 1.000550 0.9996248 1.0017984
## v_pos 0.9941635 1.000637 0.9929246 0.9954145
## height 0.9992331 1.002030 0.9952082 1.0032661
exp(fixef(mdl_maxH2_lip_vPos)[1,1]) # intercept in cm
## [1] 4.892823
(exp(fixef(mdl_maxH2_lip_vPos)[1,1])*exp(fixef(mdl_maxH2_lip_vPos)[4,1]))-exp(fixef(mdl_maxH2_lip_vPos)[1,1]) # estimate v_pos in cm (positive), i.e., every step change in v_pos causes a change in lip_dist of 0.03 mm
## [1] 0.003500904
(exp(fixef(mdl_maxH2_lip_vPos)[1,1])*exp(fixef(mdl_maxH2_lip_vPos)[2,1]))-exp(fixef(mdl_maxH2_lip_vPos)[1,1]) # estimate vowel_s in cm, i.e., difference between a and i is 5.1 mm
## [1] 0.5152275
Let’s see what’s the relationship between size and jaw opening:
lip_vowel_size <- ggplot(can,
aes(y = lip_dist,
x = vowel,
fill = can_size)) +
geom_violin(position = position_dodge(1),
trim = FALSE,
alpha = 0.4) +
# adds median and quartile:
geom_boxplot(width = 0.1,
position = position_dodge((1)),
alpha = 0.6,
outlier.shape = NA, # Hides outliers.
notch = T, # Notches are used to compare groups; if the notches of two boxes do not overlap, this suggests that the medians are significantly different.
coef = 0) + # Length of the whiskers as multiple of IQR. Defaults to 1.5.
scale_fill_manual(values = colorBlindBlack8) +
labs(x = "Vowel",
y = "Lip opening",
fill = "Can size") +
theme_bw()+
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold"),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
lip_vowel_size
# ggsave(plot = lip,
# filename = paste0(plots, "lip_vowel_size.pdf"),
# width = 8, height = 5)
The lower and upper hinges correspond to the first and third quartiles (the 25th and 75th percentiles). There are no whiskers to make the plot more clear.
The vowel does make a big difference, but the size has no effect.
Head position model:
# Extracting posterior samples
post_maxH2_lip_headPos <- mdl_maxH2_lip_headPos %>%
gather_draws(b_vowel_s, b_can_size_s, b_head_pos, b_h_pos, b_height)
# Plotting intervals with densities
postplot_maxH2_lip_headPos <-
ggplot(post_maxH2_lip_headPos,
aes(x = .value, y = .variable)) +
stat_halfeye(.width = 0.95, fill = colorBlindBlack8[2], alpha = 0.7, size = 2) +
geom_segment(x = 0, xend = 0, y = -Inf, yend = Inf,
linetype = "dashed", color = "grey", size = 1) +
theme_bw() +
labs(x = "Posterior density", y = "Variable") +
scale_y_discrete(labels = c("Can size", "Horizontal pos.", "Head position", "Height", "Vowel")) #+ # Must be in alphabetical order
#scale_x_continuous(limits = c(-0.13, 0.01))
print(postplot_maxH2_lip_headPos +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold")))
# ggsave(filename = paste0(plots, "postplot_maxH2_lip_headPos.pdf"),
# plot = last_plot(),
# width = 10, height = 8)
Vertical position model:
# Extracting posterior samples
post_maxH2_lip_vPos <- mdl_maxH2_lip_vPos %>%
gather_draws(b_vowel_s, b_can_size_s, b_v_pos, b_h_pos, b_height)
# Plotting intervals with densities
postplot_maxH2_lip_vPos <-
ggplot(post_maxH2_lip_vPos,
aes(x = .value, y = .variable)) +
stat_halfeye(.width = 0.95, fill = colorBlindBlack8[2], alpha = 0.7, size = 2) +
geom_segment(x = 0, xend = 0, y = -Inf, yend = Inf,
linetype = "dashed", color = "grey", size = 1) +
theme_bw() +
labs(x = "Posterior density", y = "Variable") +
scale_y_discrete(labels = c("Can size", "Horizontal pos.", "Height", "Vertical pos.", "Vowel")) #+ # Must be in alphabetical order
#scale_x_continuous(limits = c(-0.13, 0.01))
print(postplot_maxH2_lip_vPos +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14,
face = "bold")))
# ggsave(filename = paste0(plots, "postplot_maxH2_lip_vPos.pdf"),
# plot = last_plot(),
# width = 10, height = 8)
Since this one will be lognormal, let’s look what values we can expect.
# In intercept is 1000
log(1000)
## [1] 6.907755
# If we set the prior to normal(0,5), what would be the boundaries
exp(log(1000)-2.5); exp(log(1000)+2.5)
## [1] 82.085
## [1] 12182.49
# If we set the prior to normal(0,4), what would be the boundaries
exp(log(1000)-2); exp(log(1000)+2)
## [1] 135.3353
## [1] 7389.056
# If we set the prior to normal(0,3), what would be the boundaries
exp(log(1000)-1.5); exp(log(1000)+1.5)
## [1] 223.1302
## [1] 4481.689
get_prior(formula = f_disp ~ 1 + vowel_s + can_size_s + head_pos + h_pos + v_pos + height + # head_pos and v_pos will be separated
(1 + vowel_s + can_size_s + head_pos + h_pos + v_pos || speaker),
data = can,
family = lognormal())
## Warning: Rows containing NAs were excluded from the model.
I will not specify any priors for the group-level effects.
priors_maxH2_form <- c(
prior('normal(0, 4)', class = 'Intercept'),
prior('normal(0, 1)', class = b)
)
We have to fit both dependent variables with the two predictors head position and vertical position separately.
mdl_maxH2_form_headPos <- brm(f_disp ~
1 + vowel_s + can_size_s +
head_pos + h_pos + height +
(1 + vowel_s + can_size_s +
head_pos + h_pos || speaker),
data = can,
prior = priors_maxH2_form,
family = lognormal(),
#backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_maxH2_form_headPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_maxH2_form_headPos, file = paste0(models, "mdl_maxH2_form_headPos.rds"), compress = "xz")
mdl_maxH2_form_headPos <- readRDS(paste0(models, "mdl_maxH2_form_headPos.rds"))
beepr::beep()
Let’s check how it looks like.
summary(mdl_maxH2_form_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f_disp ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 + vowel_s + can_size_s + head_pos + h_pos || speaker)
## Data: can (Number of observations: 2886)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.04 0.02 0.00 0.08 1.00 1822 6526
## sd(vowel_s) 0.10 0.01 0.08 0.13 1.00 3821 6752
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 9653 8784
## sd(head_pos) 0.00 0.00 0.00 0.00 1.00 2061 4274
## sd(h_pos) 0.01 0.00 0.00 0.01 1.00 7263 10737
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.15 0.32 6.52 7.77 1.00 17284 12307
## vowel_s -0.15 0.02 -0.19 -0.12 1.00 1960 3929
## can_size_s 0.00 0.00 -0.00 0.01 1.00 27033 12258
## head_pos 0.00 0.00 -0.00 0.00 1.00 19511 11090
## h_pos 0.00 0.00 -0.00 0.00 1.00 14213 12593
## height -0.00 0.00 -0.01 0.00 1.00 17672 12264
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.06 0.00 0.06 0.06 1.00 28845 11298
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_maxH2_form_headPos)
conditional_effects(mdl_maxH2_form_headPos, sample_prior = "only")
pp_check(mdl_maxH2_form_headPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_maxH2_form_headPos_loo.rds"))) {
mdl_maxH2_form_headPos_loo <- readRDS(paste0(models, "mdl_maxH2_form_headPos_loo.rds"))
} else {
mdl_maxH2_form_headPos_loo <- loo(mdl_maxH2_form_headPos, moment_match = TRUE)
saveRDS(mdl_maxH2_form_headPos_loo, paste0(models, "mdl_maxH2_form_headPos_loo.rds"))
}
Check the loo.
mdl_maxH2_form_headPos_loo
##
## Computed from 16000 by 2886 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -15963.8 79.0
## p_loo 88.2 5.9
## looic 31927.6 158.0
## ------
## MCSE of elpd_loo is NA.
##
## Pareto k diagnostic values:
## Count Pct. Min. ESS
## (-Inf, 0.7] (good) 2885 100.0% 1114
## (0.7, 1] (bad) 1 0.0% <NA>
## (1, Inf) (very bad) 0 0.0% <NA>
## See help('pareto-k-diagnostic') for details.
See if any reloo is needed.
if (file.exists(paste0(models, "mdl_maxH2_form_headPos_reloo.rds"))) {
mdl_maxH2_form_headPos_reloo <- readRDS(paste0(models, "mdl_maxH2_form_headPos_reloo.rds"))
} else {
mdl_maxH2_form_headPos_reloo <- reloo(mdl_maxH2_form_headPos_loo, mdl_maxH2_form_headPos, chains = 1)
saveRDS(mdl_maxH2_form_headPos_reloo, paste0(models, "mdl_maxH2_form_headPos_reloo.rds"))
}
mdl_maxH2_form_headPos_reloo
##
## Computed from 16000 by 2886 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -15963.8 79.0
## p_loo 88.8 6.2
## looic 31927.5 158.0
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
mdl_maxH2_form_vPos <- brm(f_disp ~ 1 + vowel_s + can_size_s +
h_pos + v_pos + height +
(1 + vowel_s + can_size_s +
h_pos + v_pos || speaker),
data = can,
prior = priors_maxH2_form,
family = lognormal(),
#backend = "cmdstanr",
cores = 4,
chains = 4,
iter = 8000,
warmup = 4000,
seed = 998,
save_pars = save_pars(all = TRUE),
control = list(max_treedepth = 13,
adapt_delta = 0.99),
file = paste0(models, "mdl_maxH2_form_vPos.rds"))
# if we need to compress the model more
#saveRDS(mdl_maxH2_form_vPos, file = paste0(models, "mdl_maxH2_form_vPos.rds"), compress = "xz")
mdl_maxH2_form_vPos <- readRDS(paste0(models, "mdl_maxH2_form_vPos.rds"))
beepr::beep()
Let’s check how it looks like.
summary(mdl_maxH2_form_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f_disp ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 + vowel_s + can_size_s + h_pos + v_pos || speaker)
## Data: can (Number of observations: 2892)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.07 0.01 0.06 0.10 1.00 5435 8914
## sd(vowel_s) 0.10 0.01 0.08 0.13 1.00 3573 5819
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 9836 8952
## sd(h_pos) 0.01 0.00 0.00 0.01 1.00 8476 11843
## sd(v_pos) 0.00 0.00 0.00 0.01 1.00 5129 3451
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.24 0.31 6.63 7.84 1.00 4370 7701
## vowel_s -0.15 0.02 -0.19 -0.12 1.00 1591 3270
## can_size_s 0.00 0.00 -0.00 0.01 1.00 29250 11799
## h_pos 0.00 0.00 -0.00 0.00 1.00 11721 12107
## v_pos 0.00 0.00 -0.00 0.00 1.00 16935 12914
## height -0.00 0.00 -0.01 0.00 1.00 4459 7715
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.06 0.00 0.06 0.06 1.00 25415 11216
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
plot(mdl_maxH2_form_vPos)
conditional_effects(mdl_maxH2_form_vPos, sample_prior = "only")
pp_check(mdl_maxH2_form_vPos, ndraws = 100)
Let’s compute the loo for model comparison.
# run loo mdl
if (file.exists(paste0(models, "mdl_maxH2_form_vPos_loo.rds"))) {
mdl_maxH2_form_vPos_loo <- readRDS(paste0(models, "mdl_maxH2_form_vPos_loo.rds"))
} else {
mdl_maxH2_form_vPos_loo <- loo(mdl_maxH2_form_vPos, moment_match = TRUE)
saveRDS(mdl_maxH2_form_vPos_loo, paste0(models, "mdl_maxH2_form_vPos_loo.rds"))
}
Check the loo.
mdl_maxH2_form_vPos_loo
##
## Computed from 16000 by 2892 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -15992.3 80.1
## p_loo 102.6 6.9
## looic 31984.6 160.3
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
See if any reloo is needed.
if (file.exists(paste0(models, "mdl_maxH2_form_vPos_reloo.rds"))) {
mdl_maxH2_form_vPos_reloo <- readRDS(paste0(models, "mdl_maxH2_form_vPos_reloo.rds"))
} else {
mdl_maxH2_form_vPos_reloo <- reloo(mdl_maxH2_form_vPos_loo, mdl_maxH2_form_vPos, chains = 1)
saveRDS(mdl_maxH2_form_vPos_reloo, paste0(models, "mdl_maxH2_form_vPos_reloo.rds"))
}
mdl_maxH2_form_vPos_reloo
##
## Computed from 16000 by 2892 log-likelihood matrix.
##
## Estimate SE
## elpd_loo -15992.3 80.1
## p_loo 102.6 6.9
## looic 31984.6 160.3
## ------
## MCSE of elpd_loo is 0.1.
##
## All Pareto k estimates are good (k < 0.7).
## See help('pareto-k-diagnostic') for details.
We would expect:
First, let’s look at the summary again.
summary(mdl_maxH2_form_headPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f_disp ~ 1 + vowel_s + can_size_s + head_pos + h_pos + height + (1 + vowel_s + can_size_s + head_pos + h_pos || speaker)
## Data: can (Number of observations: 2886)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.04 0.02 0.00 0.08 1.00 1822 6526
## sd(vowel_s) 0.10 0.01 0.08 0.13 1.00 3821 6752
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 9653 8784
## sd(head_pos) 0.00 0.00 0.00 0.00 1.00 2061 4274
## sd(h_pos) 0.01 0.00 0.00 0.01 1.00 7263 10737
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.15 0.32 6.52 7.77 1.00 17284 12307
## vowel_s -0.15 0.02 -0.19 -0.12 1.00 1960 3929
## can_size_s 0.00 0.00 -0.00 0.01 1.00 27033 12258
## head_pos 0.00 0.00 -0.00 0.00 1.00 19511 11090
## h_pos 0.00 0.00 -0.00 0.00 1.00 14213 12593
## height -0.00 0.00 -0.01 0.00 1.00 17672 12264
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.06 0.00 0.06 0.06 1.00 28845 11298
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
All hypotheses tested together:
hypothesis(mdl_maxH2_form_headPos, c("vowel_s < 0", "head_pos > 0", "height < 0", "can_size_s > 0", "h_pos > 0"))
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (vowel_s) < 0 -0.15 0.02 -0.19 -0.12 Inf 1.00
## 2 (head_pos) > 0 0.00 0.00 0.00 0.00 8.51 0.89
## 3 (height) < 0 0.00 0.00 -0.01 0.00 6.87 0.87
## 4 (can_size_s) > 0 0.00 0.00 0.00 0.00 2.35 0.70
## 5 (h_pos) > 0 0.00 0.00 0.00 0.00 3.65 0.78
## Star
## 1 *
## 2
## 3
## 4
## 5
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH2_form_headPos, c("vowel_s < 0", "head_pos > 0", "height < 0", "can_size_s < 0", "h_pos > 0")))
A summary of the findings from the data, the priors, and the model:
Change to report:
summary(can$f_disp) # the unit is Hz
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 427.6 964.2 1044.4 1044.5 1119.4 1521.9 108
exp(fixef(mdl_maxH2_form_headPos))
## Estimate Est.Error Q2.5 Q97.5
## Intercept 1273.8718588 1.370393 681.8020182 2358.2324962
## vowel_s 0.8574150 1.019130 0.8254486 0.8896534
## can_size_s 1.0011842 1.002257 0.9968339 1.0056356
## head_pos 1.0010978 1.000875 0.9993983 1.0028228
## h_pos 1.0010231 1.001327 0.9983897 1.0036456
## height 0.9977047 1.002055 0.9936061 1.0017531
exp(fixef(mdl_maxH2_form_headPos)[1,1]) # intercept in Hz
## [1] 1273.872
(exp(fixef(mdl_maxH2_form_headPos)[1,1])*exp(fixef(mdl_maxH2_form_headPos)[4,1]))-exp(fixef(mdl_maxH2_form_headPos)[1,1]) # estimate head_pos in Hz (positive), i.e., every 1 cm change in head_pos causes a change in formant dispersion of... (not reliable though)
## [1] 1.398443
(exp(fixef(mdl_maxH2_form_headPos)[1,1])*exp(fixef(mdl_maxH2_form_headPos)[2,1]))-exp(fixef(mdl_maxH2_form_headPos)[1,1]) # estimate vowel_s in Hz (negative), i.e., the formant dispersion difference between a and i
## [1] -181.635
We would expect:
First, let’s look at the summary again.
summary(mdl_maxH2_form_vPos)
## Family: lognormal
## Links: mu = identity; sigma = identity
## Formula: f_disp ~ 1 + vowel_s + can_size_s + h_pos + v_pos + height + (1 + vowel_s + can_size_s + h_pos + v_pos || speaker)
## Data: can (Number of observations: 2892)
## Draws: 4 chains, each with iter = 8000; warmup = 4000; thin = 1;
## total post-warmup draws = 16000
##
## Multilevel Hyperparameters:
## ~speaker (Number of levels: 30)
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sd(Intercept) 0.07 0.01 0.06 0.10 1.00 5435 8914
## sd(vowel_s) 0.10 0.01 0.08 0.13 1.00 3573 5819
## sd(can_size_s) 0.00 0.00 0.00 0.01 1.00 9836 8952
## sd(h_pos) 0.01 0.00 0.00 0.01 1.00 8476 11843
## sd(v_pos) 0.00 0.00 0.00 0.01 1.00 5129 3451
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## Intercept 7.24 0.31 6.63 7.84 1.00 4370 7701
## vowel_s -0.15 0.02 -0.19 -0.12 1.00 1591 3270
## can_size_s 0.00 0.00 -0.00 0.01 1.00 29250 11799
## h_pos 0.00 0.00 -0.00 0.00 1.00 11721 12107
## v_pos 0.00 0.00 -0.00 0.00 1.00 16935 12914
## height -0.00 0.00 -0.01 0.00 1.00 4459 7715
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## sigma 0.06 0.00 0.06 0.06 1.00 25415 11216
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
All hypotheses tested together:
hypothesis(mdl_maxH2_form_vPos, c("vowel_s < 0", "v_pos > 0", "height < 0", "can_size_s > 0", "h_pos > 0"))
## Hypothesis Tests for class b:
## Hypothesis Estimate Est.Error CI.Lower CI.Upper Evid.Ratio Post.Prob
## 1 (vowel_s) < 0 -0.15 0.02 -0.18 -0.12 Inf 1.00
## 2 (v_pos) > 0 0.00 0.00 0.00 0.00 2.26 0.69
## 3 (height) < 0 0.00 0.00 0.00 0.00 5.04 0.83
## 4 (can_size_s) > 0 0.00 0.00 0.00 0.00 2.12 0.68
## 5 (h_pos) > 0 0.00 0.00 0.00 0.00 3.47 0.78
## Star
## 1 *
## 2
## 3
## 4
## 5
## ---
## 'CI': 90%-CI for one-sided and 95%-CI for two-sided hypotheses.
## '*': For one-sided hypotheses, the posterior probability exceeds 95%;
## for two-sided hypotheses, the value tested against lies outside the 95%-CI.
## Posterior probabilities of point hypotheses assume equal prior probabilities.
plot(hypothesis(mdl_maxH2_lip_vPos, c("vowel_s < 0", "v_pos < 0", "height < 0", "can_size_s < 0", "h_pos > 0")))
A summary of the findings from the data, the priors, and the model:
Change to report:
summary(can$f_disp) # the unit is centimeters
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 427.6 964.2 1044.4 1044.5 1119.4 1521.9 108
exp(fixef(mdl_maxH2_form_vPos))
## Estimate Est.Error Q2.5 Q97.5
## Intercept 1390.0757114 1.362517 756.3255561 2529.6729203
## vowel_s 0.8580838 1.018975 0.8270913 0.8901519
## can_size_s 1.0010377 1.002248 0.9966857 1.0054648
## h_pos 1.0009824 1.001339 0.9983541 1.0036175
## v_pos 1.0005207 1.001063 0.9984605 1.0026227
## height 0.9982249 1.001848 0.9946589 1.0018523
exp(fixef(mdl_maxH2_form_vPos)[1,1]) # intercept in cm
## [1] 1390.076
(exp(fixef(mdl_maxH2_form_vPos)[1,1])*exp(fixef(mdl_maxH2_form_vPos)[4,1]))-exp(fixef(mdl_maxH2_form_vPos)[1,1]) # estimate head_pos in Hz (positive), i.e., every 1 cm change in head_pos causes a change in formant dispersion of... (not reliable though)
## [1] 1.365651
(exp(fixef(mdl_maxH2_form_vPos)[1,1])*exp(fixef(mdl_maxH2_form_vPos)[2,1]))-exp(fixef(mdl_maxH2_form_vPos)[1,1]) # estimate vowel_s in Hz (negative), i.e., the formant dispersion difference between a and i
## [1] -197.2742
Scatter plot of F1 and F2:
scatter_formant_v <- ggplot(can,
aes(y = F2,
x = F1,
color = vowel)) +
geom_point(alpha = 0.2) +
stat_ellipse(segments = 51,
size = 1) +
scale_color_manual(values = colorBlindBlack8) +
labs(x = "F1",
y = "F2",
color = "Vowel")
print(scatter_formant_v +
facet_wrap(~can_size) +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 16),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12)))
The difference between the vowels is clearly visible, according to articulatory phonetics.
What happens if we switch plot the vowels on separate plots and sizes on the same? The t-test suggests there will be no difference, but let’s have a look.
scatter_formant_size <- ggplot(can,
aes(y = F2,
x = F1,
color = can_size)) +
facet_wrap(~vowel) +
geom_point(alpha = 0.2) +
stat_ellipse(segments = 51,
size = 1) +
scale_color_manual(values = colorBlindBlack8) +
labs(y = "F2",
x = "F1",
color = "Can size") +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 16),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
scatter_formant_size
# ggsave(plot = scatter_z_formant, filename = "size_vowel.pdf", width = 7, height = 4)
Exactly as already predicted by the t-test, there is no difference of vowels space between large and small cans.
I will show it again using the phonR package:
vowel_space <- with(can, plotVowels(F1, F2, vowel,
group = can_size,
plot.tokens = FALSE,
plot.means = TRUE,
pch.means = vowel,
cex.means = 2,
var.col.by = can_size,
var.sty.by = can_size,
ellipse.fill = T,
poly.line = T,
pretty = T,
col = colorBlindBlack8,
legend.kwd = "bottomleft",
cex.lab = 1.2,
cex.axis = 1
#main = "Z"
))
#ggsave(plot = vowel_space_Z, filename = "size-vowel-phonR.pdf", width = 7, height = 4)
Just for comparison, let’s have a look at bark scale instead of Z normalized values:
vowel_space_bark <- with(can, plotVowels(F1_bark, F2_bark, vowel,
group = can_size,
plot.tokens = FALSE,
plot.means = TRUE,
pch.means = vowel,
cex.means = 1.5,
var.col.by = can_size,
var.sty.by = can_size,
#ylim = c(10,3),
#xlim = c(15,8),
ellipse.fill = T,
poly.line = T,
pretty = T,
legend.kwd = "bottomleft",
main = "Bark"))
Just for the sake of it, we can also look how it looks in vertical position 1, where the effect looked most robust:
# First, filter the data:
formant_v1 <- filter(can, v_pos == '1')
# And then plot it:
size_vowel_v1 <- ggplot(formant_v1,
aes(y = F1,
x = F2,
color = can_size)) +
geom_point(alpha = 0.2) +
stat_ellipse(segments = 51,
size = 1) +
scale_color_manual(values=colorBlindBlack8) +
labs(x = "F1",
y = "F2",
color = "Can size") +
facet_wrap(~vowel) +
theme_bw() +
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 16),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
size_vowel_v1
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_ellipse()`).
## Warning: Removed 17 rows containing missing values or values outside the scale range
## (`geom_point()`).
# ggsave(plot = size_vowel_v1, filename = "size_vowel_v1.pdf", width = 7, height = 4)
The difference is more visible, but still it is barely there.
p_fDisp_vowel_size <- ggplot(can,
aes(y = f_disp,
x = vowel,
fill = can_size)) +
geom_violin(position = position_dodge(1),
trim = FALSE,
alpha = 0.4) +
# adds median and quartile:
geom_boxplot(width = 0.1,
position = position_dodge((1)),
alpha = 0.6,
outlier.shape = NA, # Hides outliers.
notch = T, # Notches are used to compare groups; if the notches of two boxes do not overlap, this suggests that the medians are significantly different.
coef = 0) + # Length of the whiskers as multiple of IQR. Defaults to 1.5.
scale_fill_manual(values = colorBlindBlack8) +
labs(x = "Vowel",
y = "Formant dispersion",
fill = "Can size") +
theme_bw()+
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 16),
legend.title = element_text(size = 14),
legend.text = element_text(size = 12))
p_fDisp_vowel_size
# ggsave(plot = lip, filename = "lip.pdf", width = 6, height = 4)
The vowel makes a big difference, but the formant dispersion doesn’t seem to make any.
This concludes the analysis.